Background

This file is designed to use CDC data to assess coronavirus disease burden by state, including creating and analyzing state-level clusters.

Through March 7, 2021, The COVID Tracking Project collected and integrated data on tests, cases, hospitalizations, deaths, and the like by state and date. The latest code for using this data is available in Coronavirus_Statistics_CTP_v004.Rmd.

The COVID Tracking Project suggests that US federal data sources are now sufficiently robust to be used for analyses that previously relied on the COVID Tracking Project. This code is an attempt to update modules in Coronavirus_Statistics_CTP_v004.Rmd to leverage US federal data.

The code leverages tidyverse and a variable mapping file throughout:

# All functions assume that tidyverse and its components are loaded and available
# Other functions are declared in the sourcing files or use library::function()
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# For future use, source key modules from a separate .R file

# Create a variable mapping file (short code -> display label) - this is not yet updated for federal data
# NOTE: every key ending in "7" denotes a 7-day rolling mean variant of the base metric
varMapper <- c("cases"="Cases",
               "tot_cases"="Total cases",
               "newCases"="Increase in cases, most recent 30 days",
               "casesroll7"="Rolling 7-day mean cases", 
               "deaths"="Deaths", 
               "tot_deaths"="Total deaths",
               "newDeaths"="Increase in deaths, most recent 30 days",
               "deathsroll7"="Rolling 7-day mean deaths", 
               "cpm"="Cases per million",
               "cpm7"="Cases per day (7-day rolling mean) per million", 
               "tcpm"="Total cases per million",
               # FIX: previously duplicated the "tcpm" label; "7" variants are rolling means
               "tcpm7"="Total cases per million (7-day rolling mean)",
               "newcpm"="Increase in cases, most recent 30 days, per million",
               "dpm"="Deaths per million", 
               "dpm7"="Deaths per day (7-day rolling mean) per million", 
               "tdpm"="Total deaths per million",
               # FIX: previously duplicated the "tdpm" label; "7" variants are rolling means
               "tdpm7"="Total deaths per million (7-day rolling mean)", 
               "newdpm"="Increase in deaths, most recent 30 days, per million", 
               "hpm7"="Currently Hospitalized per million (7-day rolling mean)", 
               "tpm"="Tests per million", 
               "tpm7"="Tests per million per day (7-day rolling mean)"
               )

# Helper functions used early in the process
# Function for saving an R object to RDS, including a check for whether the object already exists
saveToRDS <- function(obj, 
                      file=paste0(deparse(substitute(obj)), ".RDS"), 
                      dir="./RInputFiles/Coronavirus/", 
                      ovrWrite=FALSE, 
                      ovrWriteError=TRUE,
                      makeReadOnly=TRUE
                      ) {
    
    # Save an R object to an RDS file, with guards against accidental overwrites
    #
    # FUNCTION ARGUMENTS:
    # obj: the R object to save
    # file: the file name to save as (defaults to the name of obj plus ".RDS")
    # dir: the directory to save in (file path will be paste0(dir, file))
    # ovrWrite: boolean, should the file be overwritten if it already exists?
    # ovrWriteError: boolean, should an error be thrown if an attempt is made to overwrite the file?
    # makeReadOnly: boolean, should the output file be made read-only?
    #
    # Returns (invisibly) the path written, or NULL if nothing was written
    
    # Create the file name
    locFile <- paste0(dir, file)
    
    # Check if the file already exists and proceed as per options
    if (file.exists(locFile)) {
        cat("\nFile already exists:", locFile, "\n")
        # Scalar && (not &): this is a single TRUE/FALSE guard condition
        if (!ovrWrite && ovrWriteError) stop("\nAborting due to ovrWrite=FALSE and ovrWriteError=TRUE")
        if (!ovrWrite) {
            cat("\nNot replacing the existing file since ovrWrite=FALSE\n")
            return(invisible(NULL))
        }
        # BUG FIX: a previous call with makeReadOnly=TRUE left the file at mode 0555,
        # which made saveRDS() below fail on overwrite; restore write permission first
        Sys.chmod(locFile, mode="0644", use_umask=FALSE)
    }
    
    # Save the file and update the permissions to read-only (if flag is set)
    saveRDS(obj, file=locFile)
    if (makeReadOnly) Sys.chmod(locFile, mode="0555", use_umask=FALSE)
    
    invisible(locFile)
    
}



# Function for reading an R object from RDS
readFromRDS <- function(file, 
                        dir="./RInputFiles/Coronavirus/", 
                        addSuffix=".RDS", 
                        deparseSub=FALSE
                        ) {
    
    # Read an R object back from an RDS file (counterpart of saveToRDS)
    #
    # FUNCTION ARGUMENTS:
    # file: the file name to read in
    # dir: the directory the file is in
    # addSuffix: the suffix that should be added to file (file path will be paste0(dir, file, addSuffix))
    # deparseSub: whether to deparse and substitute file (pass an unquoted symbol as the file name)
    
    # Convert file if needed (must run before file is otherwise evaluated, relying on lazy evaluation)
    if (deparseSub) file <- deparse(substitute(file))
    
    # Ensure that file is of type character
    # (is.character is the idiomatic check; avoids the fragile all.equal(class(x), ...) pattern)
    if (!is.character(file)) {
        stop("\nUnable to read since file is not a character\n")
    }
    
    # Create the file name
    locFile <- paste0(dir, file, addSuffix)
    
    # Read the file (will be the return)
    readRDS(locFile)
    
}

# Sum a numeric vector where an all-NA (or empty) input yields NA instead of 0,
# but any non-NA values are summed with NAs dropped (NA+NA=NA, NA+number=number)
specNASum <- function(x) if (all(is.na(x))) NA else sum(x, na.rm=TRUE)

Downloading and Integrating US Federal Data

Cases and Deaths by State-Date

CDC case and death data by state and date are available for download on the CDC website. The previously written function downloadCOVIDbyState() can be used to acquire the data by updating the API:

# NO CHANGES MADE TO FUNCTION - default API is from COVID Tracking Project
# Function to download data for COVID Tracking Project
# NO CHANGES MADE TO FUNCTION - default API is from COVID Tracking Project
# Function to download data for COVID Tracking Project
downloadCOVIDbyState <- function(fileName, 
                                 api="https://api.covidtracking.com/v1/states/daily.csv", 
                                 ovrWrite=FALSE
                                 ) {
    
    # FUNCTION ARGUMENTS:
    # fileName: the filename that the data will be saved to
    # api: The API link for data downloads
    # ovrWrite: whether to allow overwriting of the existing fileName
    
    # Guard against clobbering an existing download unless explicitly allowed
    alreadyThere <- file.exists(fileName)
    if (alreadyThere) {
        cat("\nFile already exists at:", fileName, "\n")
        if (!ovrWrite) stop("Exiting due to ovrWrite=FALSE and a duplicate fileName\n")
        cat("Will over-write with current data from", api, "\n")
    }
    
    # Pull the data from the API endpoint to the local file
    download.file(api, destfile=fileName)
    
    # Return statistics on the downloaded file (last expression is the return value)
    file.info(fileName)
    
}

The data are downloaded and the process cached to avoid repeated hits against the CDC website:

# Snapshot the CDC cases/deaths by state-date file locally
# ovrWrite=FALSE: if the snapshot already exists, the call stops rather than re-downloading,
# caching the process to avoid repeated hits on the CDC endpoint
downloadCOVIDbyState("./RInputFiles/Coronavirus/CDC_dc_downloaded_210414.csv",
                     api="https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD", 
                     ovrWrite=FALSE
                     )
##                                                           size isdir mode
## ./RInputFiles/Coronavirus/CDC_dc_downloaded_210414.csv 2180440 FALSE  666
##                                                                      mtime
## ./RInputFiles/Coronavirus/CDC_dc_downloaded_210414.csv 2021-04-14 08:57:22
##                                                                      ctime
## ./RInputFiles/Coronavirus/CDC_dc_downloaded_210414.csv 2021-04-14 08:57:19
##                                                                      atime exe
## ./RInputFiles/Coronavirus/CDC_dc_downloaded_210414.csv 2021-04-14 08:57:22  no

Key fields from the documentation include:

  • submission_date - date of counts
  • state - state (includes 50 states, DC, NYC, and 8 territories/federations)
  • tot_cases - cumulative number of cases (confirmed and probable)
  • new_case - new cases (confirmed and probable)
  • tot_death - cumulative number of deaths (confirmed and probable)
  • new_death - new deaths (confirmed and probable)
  • consent_cases - boolean tracked as “Agree” (confirmed and probable tracked separately) or “Not Agree” (only total tracked)
  • consent_deaths - boolean tracked as “Agree” or “Not Agree” (same as for consent_cases)

Basic formatting and QC is run on the downloaded data (this can later be converted to functional form):

# Read and glimpse downloaded CDC file
# readr::read_csv infers column types; submission_date/created_at arrive as character
# and are parsed into proper date/datetime types later in cdcProcessed
cdcRaw <- readr::read_csv("./RInputFiles/Coronavirus/CDC_dc_downloaded_210414.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   submission_date = col_character(),
##   state = col_character(),
##   tot_cases = col_double(),
##   conf_cases = col_double(),
##   prob_cases = col_double(),
##   new_case = col_double(),
##   pnew_case = col_double(),
##   tot_death = col_double(),
##   conf_death = col_double(),
##   prob_death = col_double(),
##   new_death = col_double(),
##   pnew_death = col_double(),
##   created_at = col_character(),
##   consent_cases = col_character(),
##   consent_deaths = col_character()
## )
glimpse(cdcRaw)
## Rows: 26,820
## Columns: 15
## $ submission_date <chr> "01/01/2021", "04/30/2020", "02/26/2020", "03/05/20...
## $ state           <chr> "FL", "IA", "UT", "GA", "WV", "NY", "GA", "TN", "AS...
## $ tot_cases       <dbl> 1300528, 7145, 0, 2, 1224, 346492, 28196, 143937, 0...
## $ conf_cases      <dbl> NA, NA, NA, NA, 1224, NA, 28182, 141000, NA, NA, NA...
## $ prob_cases      <dbl> NA, NA, NA, NA, 0, NA, 14, 2937, NA, NA, NA, NA, 15...
## $ new_case        <dbl> 0, 302, 0, -5, 29, 5775, 602, 1854, 0, 0, 7598, 55,...
## $ pnew_case       <dbl> 6063, 0, NA, NA, 0, 0, 1, 38, 0, 0, 0, 0, 781, 705,...
## $ tot_death       <dbl> 21673, 162, 0, 0, 50, 10117, 1258, 1567, 0, 0, 1117...
## $ conf_death      <dbl> NA, NA, NA, NA, NA, NA, 1258, 1527, NA, NA, NA, NA,...
## $ prob_death      <dbl> NA, NA, NA, NA, NA, NA, 0, 40, NA, NA, NA, NA, 5080...
## $ new_death       <dbl> 0, 14, 0, 0, 0, 56, 47, 4, 0, 0, 229, 3, 60, 10, 3,...
## $ pnew_death      <dbl> 7, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 6, 2, 1, 478,...
## $ created_at      <chr> "01/02/2021 02:50:51 PM", "05/01/2020 09:00:19 PM",...
## $ consent_cases   <chr> "Not agree", "Not agree", "Agree", "Agree", "Agree"...
## $ consent_deaths  <chr> "Not agree", "Not agree", "Agree", "Agree", "Not ag...
# Check for variables in the consent_ fields
# Cross-tab of consent_cases x consent_deaths; values include "Agree", "Not agree",
# the literal string "N/A", and true NA - all handled by cdcToBool() below
cdcRaw %>% 
    count(consent_cases, consent_deaths)
## # A tibble: 10 x 3
##    consent_cases consent_deaths     n
##    <chr>         <chr>          <int>
##  1 Agree         Agree          12487
##  2 Agree         N/A              447
##  3 Agree         Not agree       1788
##  4 N/A           Agree           1341
##  5 N/A           N/A              894
##  6 N/A           Not agree        447
##  7 Not agree     Agree           1370
##  8 Not agree     N/A              447
##  9 Not agree     Not agree       5364
## 10 <NA>          <NA>            2235
# Function to convert N/A, Agree, and Not agree to boolean
# Mapping: NA or "N/A" -> NA, "Agree" -> TRUE, "Not agree" -> FALSE;
# any other value aborts with an error
cdcToBool <- function(x) {
    known <- c("N/A", "Agree", "Not agree")
    # Any non-missing value outside the known set is a data problem
    if (any(!is.na(x) & !(x %in% known))) stop("Problem with the boolean conversion")
    out <- rep(NA, length(x))
    out[!is.na(x) & x == "Agree"] <- TRUE
    out[!is.na(x) & x == "Not agree"] <- FALSE
    out
}

# Format fields as desired types
# - submission_date ("MM/DD/YYYY") is parsed to a Date
# - created_at ("MM/DD/YYYY HH:MM:SS AM/PM") is parsed to a datetime
# - consent_cases / consent_deaths are converted to logical via cdcToBool()
cdcProcessed <- cdcRaw %>%
    mutate(date=lubridate::mdy(submission_date), 
           creation_date=lubridate::mdy_hms(created_at), 
           bool_c_cases=cdcToBool(consent_cases), 
           bool_c_deaths=cdcToBool(consent_deaths)
           )
glimpse(cdcProcessed)
## Rows: 26,820
## Columns: 19
## $ submission_date <chr> "01/01/2021", "04/30/2020", "02/26/2020", "03/05/20...
## $ state           <chr> "FL", "IA", "UT", "GA", "WV", "NY", "GA", "TN", "AS...
## $ tot_cases       <dbl> 1300528, 7145, 0, 2, 1224, 346492, 28196, 143937, 0...
## $ conf_cases      <dbl> NA, NA, NA, NA, 1224, NA, 28182, 141000, NA, NA, NA...
## $ prob_cases      <dbl> NA, NA, NA, NA, 0, NA, 14, 2937, NA, NA, NA, NA, 15...
## $ new_case        <dbl> 0, 302, 0, -5, 29, 5775, 602, 1854, 0, 0, 7598, 55,...
## $ pnew_case       <dbl> 6063, 0, NA, NA, 0, 0, 1, 38, 0, 0, 0, 0, 781, 705,...
## $ tot_death       <dbl> 21673, 162, 0, 0, 50, 10117, 1258, 1567, 0, 0, 1117...
## $ conf_death      <dbl> NA, NA, NA, NA, NA, NA, 1258, 1527, NA, NA, NA, NA,...
## $ prob_death      <dbl> NA, NA, NA, NA, NA, NA, 0, 40, NA, NA, NA, NA, 5080...
## $ new_death       <dbl> 0, 14, 0, 0, 0, 56, 47, 4, 0, 0, 229, 3, 60, 10, 3,...
## $ pnew_death      <dbl> 7, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 6, 2, 1, 478,...
## $ created_at      <chr> "01/02/2021 02:50:51 PM", "05/01/2020 09:00:19 PM",...
## $ consent_cases   <chr> "Not agree", "Not agree", "Agree", "Agree", "Agree"...
## $ consent_deaths  <chr> "Not agree", "Not agree", "Agree", "Agree", "Not ag...
## $ date            <date> 2021-01-01, 2020-04-30, 2020-02-26, 2020-03-05, 20...
## $ creation_date   <dttm> 2021-01-02 14:50:51, 2020-05-01 21:00:19, 2020-03-...
## $ bool_c_cases    <lgl> FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, ...
## $ bool_c_deaths   <lgl> FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE,...
# Get control totals by state for numeric fields
# Simple column sums by state, used only as a QC check on the download
# NOTE(review): summarize_if is superseded in current dplyr; across(where(is.numeric), ...)
# is the modern equivalent
cdcControl <- cdcProcessed %>%
    group_by(state) %>%
    summarize_if(is.numeric, .funs=function(x) sum(x, na.rm=TRUE))

# Plot control totals by state for numeric fields
# One horizontal bar chart per numeric column; states ordered by the plotted value,
# with the column renamed to a generic "y" so one plotting recipe works for all metrics
for (keyVar in names(cdcControl)[2:ncol(cdcControl)]) {
    p1 <- cdcControl %>%
        select_at(vars(all_of(c("state", keyVar)))) %>%
        purrr::set_names(c("state", "y")) %>%
        ggplot(aes(x=fct_reorder(state, y), y=y)) + 
        geom_col(fill="lightblue") + 
        geom_text(aes(y=y/2, label=format(y, big.mark=",")), size=3, hjust=0) +
        labs(x="", y="", title=paste0("Control totals by state for ", keyVar)) + 
        coord_flip()
    print(p1)
}

Not every state reports on every metric. In particular, some jurisdictions break cases and deaths into probable and confirmed while others do not. All states appear to report both total (cumulative) and new cases. Comparisons of these fields are run, since restatements of history can lead to total and cumsum(new) being different:

# Check for alignment of total and sum(new)
# For each state, compare the reported cumulative totals (tot_cases / tot_death)
# against a running cumsum of the daily new counts (NAs treated as 0), then count
# the number of dates on which the two disagree; keep only states with mismatches
cdcMism <- cdcProcessed %>%
    arrange(state, date) %>%
    group_by(state) %>%
    mutate(ck_tot_case=cumsum(ifelse(is.na(new_case), 0, new_case)), 
           ck_tot_death=cumsum(ifelse(is.na(new_death), 0, new_death))
           ) %>%
    select(date, state, tot_cases, ck_tot_case, tot_death, ck_tot_death, everything()) %>%
    mutate(mism_case=tot_cases != ck_tot_case, mism_death=tot_death != ck_tot_death) %>%
    group_by(state) %>%
    summarize(mism_case=sum(mism_case), mism_death=sum(mism_death), .groups="drop") %>%
    filter(mism_death > 0 | mism_case > 0) %>%
    arrange(-mism_case, -mism_death)
cdcMism
## # A tibble: 15 x 3
##    state mism_case mism_death
##    <chr>     <int>      <int>
##  1 TX          163          0
##  2 AL           41          0
##  3 MN           35         35
##  4 CA           34         47
##  5 MT            4          4
##  6 AZ            4          0
##  7 IN            1         68
##  8 NYC           0        363
##  9 NJ            0        292
## 10 OH            0         60
## 11 WV            0         32
## 12 MI            0         31
## 13 KY            0         26
## 14 MS            0         19
## 15 NC            0         19

Many states have data that are aligned throughout. In some states, there are differences between the total and new fields that should be explored further. A function is written to assess mismatches in the data:

# Function to report totals and plot trends by state
# Function to report totals and plot trends by state
assessMismatch <- function(keyStates, 
                           keyMetric="cases",
                           df=cdcProcessed
                           ) {
    
    # Compare reported cumulative totals against the cumulative sum of daily new
    # counts for selected states: print the final-date discrepancies, plot the
    # evolution of both series, and plot the cumulative discrepancy by state
    #
    # FUNCTION ARGUMENTS
    # keyStates: states to be explored for differences
    # keyMetric: metric to be explored ("cases" or "deaths")
    # df: the data frame containing data by state-date
    
    # Create a main database for comparisons
    # BUG FIX: previously referenced the global cdcProcessed directly, so the
    # df argument was silently ignored
    dfUse <- df %>%
        arrange(state, date) %>%
        group_by(state) %>%
        mutate(ck_tot_case=cumsum(ifelse(is.na(new_case), 0, new_case)), 
               ck_tot_death=cumsum(ifelse(is.na(new_death), 0, new_death)), 
               d_cases=ck_tot_case-tot_cases,
               d_deaths=ck_tot_death-tot_death
               ) %>%
        select(date, 
               state, 
               cases=tot_cases, 
               ck_cases=ck_tot_case, 
               d_cases,
               deaths=tot_death, 
               ck_deaths=ck_tot_death, 
               d_deaths,
               everything()
               )
    
    # Show the discrepancies in the final data for each state
    # FIX: plain %in% comparison; all_of() is a tidyselect helper and errors
    # when used outside a selecting function in current dplyr
    dfUse %>%
        filter(state %in% keyStates) %>%
        group_by(state) %>%
        filter(row_number()==n()) %>% 
        select(date, state, cases, ck_cases, d_cases, deaths, ck_deaths, d_deaths) %>%
        print()
    
    # Create plot of metric evolution (long format, restricted to keyMetric columns)
    dfPlot <- dfUse %>%
        filter(state %in% keyStates) %>%
        select(date, state, cases, ck_cases, d_cases, deaths, ck_deaths, d_deaths) %>%
        pivot_longer(-c(date, state)) %>%
        filter(str_detect(name, keyMetric)) 
    # Plot 1: reported total vs cumsum(new), one facet per state
    p1 <- dfPlot %>%
        filter(!str_detect(name, "d_")) %>%
        mutate(useName=ifelse(str_detect(name, "ck_"), "cumsum(new)", "reported total")) %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(group=useName, color=useName)) +
        labs(x="", 
             y="Cumulative reported", 
             title=paste0("Discrepancies in cumulative total ", keyMetric)
             ) +
        scale_color_discrete("Source") +
        facet_wrap(~state, scales="free_y")
    print(p1)
    # Plot 2: the cumulative discrepancy (d_*) itself, colored by state
    p2 <- dfPlot %>%
        filter(str_detect(name, "d_")) %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(group=state, color=state)) +
        labs(x="", 
             y="Cumulative discrepancy", 
             title=paste0("Discrepancies in cumulative total ", keyMetric)
             ) +
        scale_color_discrete("State")
    print(p2)
    
}

The function can then be applied to the case and death data with mismatches:

# Explore the states flagged with case mismatches (total vs cumsum(new))
assessMismatch(keyStates=cdcMism %>% filter(mism_case > 0) %>% pull(state), keyMetric="cases")
## # A tibble: 7 x 8
## # Groups:   state [7]
##   date       state   cases ck_cases d_cases deaths ck_deaths d_deaths
##   <date>     <chr>   <dbl>    <dbl>   <dbl>  <dbl>     <dbl>    <dbl>
## 1 2021-04-12 AL     519071   512950   -6121  10712     10712        0
## 2 2021-04-12 AZ     850236   849611    -625  17086     17086        0
## 3 2021-04-12 CA    3602827  3599149   -3678  59249     58443     -806
## 4 2021-04-12 IN     699823   699538    -285  13151     11644    -1507
## 5 2021-04-12 MN     544046   543155    -891   7037      6899     -138
## 6 2021-04-12 MT     106253   106181     -72   1524      1498      -26
## 7 2021-04-12 TX    2819529  2749901  -69628  48219     48219        0

# Explore the states flagged with death mismatches (total vs cumsum(new))
assessMismatch(keyStates=cdcMism %>% filter(mism_death > 0) %>% pull(state), keyMetric="deaths")
## # A tibble: 12 x 8
## # Groups:   state [12]
##    date       state   cases ck_cases d_cases deaths ck_deaths d_deaths
##    <date>     <chr>   <dbl>    <dbl>   <dbl>  <dbl>     <dbl>    <dbl>
##  1 2021-04-12 CA    3602827  3599149   -3678  59249     58443     -806
##  2 2021-04-12 IN     699823   699538    -285  13151     11644    -1507
##  3 2021-04-12 KY     433352   433352       0   6257      5474     -783
##  4 2021-04-12 MI     830957   830957       0  17576     17220     -356
##  5 2021-04-12 MN     544046   543155    -891   7037      6899     -138
##  6 2021-04-12 MS     307836   307836       0   7119      7085      -34
##  7 2021-04-12 MT     106253   106181     -72   1524      1498      -26
##  8 2021-04-12 NC     935061   935061       0  12290     12222      -68
##  9 2021-04-12 NJ     955073   955073       0  24896     23072    -1824
## 10 2021-04-12 NYC    892028   892028       0  31909     27850    -4059
## 11 2021-04-12 OH    1041389  1041389       0  18827     15064    -3763
## 12 2021-04-12 WV     146462   146462       0   2745      2580     -165

The mismatches appear to arise at discrete points in time, likely reflecting a reclassification of many previous cases and deaths. The total field is always greater than or equal to the sum of the new field. This suggests using ‘new’ for the shape of the curve and ‘total’ for overall disease burden.

A function is then written to rename and split columns appropriately:

# Create list of expected variables and renames (NA means keep as-is)
# The order here determines the column order after selection in renamePivotProcessed()
# Every name other than date/state must end up in the form modifier_metric
# (one underscore) so it can be split when pivoting to long format
cdcVarNames <- c("date"=NA, 
                 "state"=NA, 
                 "tot_cases"=NA, 
                 "conf_cases"=NA, 
                 "prob_cases"=NA, 
                 "new_case"="new_cases", 
                 "pnew_case"="pnew_cases", 
                 "tot_death"="tot_deaths", 
                 "conf_death"="conf_deaths", 
                 "prob_death"="prob_deaths", 
                 "new_death"="new_deaths", 
                 "pnew_death"="pnew_deaths",
                 "bool_c_cases"="bool_cases", 
                 "bool_c_deaths"="bool_deaths"
                 )

# Function to rename variables, split, and pivot for easier analysis
renamePivotProcessed <- function(df, 
                                 selectRename=cdcVarNames
                                 ) {
    
    # Select and rename columns per selectRename, then pivot to a long
    # date-state-modifier-metric-value layout (names split on "_")
    #
    # FUNCTION ARGUMENTS
    # df: a processed CDC data file
    # selectRename: a named vector of variable -> new name (NA means keep as-is) as c('original'='new')
    
    # Check alignment of variables in df and selectRename
    dfNames <- names(df)
    selNames <- names(selectRename)
    
    # Create the vector of renamed variables after selection
    selRenames <- unname(selectRename[selNames])
    selRenames[is.na(selRenames)] <- selNames[is.na(selRenames)]
    
    # Names in one but not the other
    cat("\n*** Variables that will be dropped (not in selectRename vector) include:", 
        setdiff(dfNames, selNames), 
        sep="\n"
        )
    cat("\n\n*** Variables passed in selectRename that are not in the data include:", 
        setdiff(selNames, dfNames), 
        sep="\n"
        )
    
    # Select the key variables and rename
    # BUG FIX: subset to variables actually present in df; previously a variable
    # declared in selectRename but absent from the data (already reported above)
    # caused select() to error instead of being skipped
    isPresent <- selNames %in% dfNames
    df <- df %>%
        select(all_of(selNames[isPresent])) %>%
        purrr::set_names(selRenames[isPresent])

    # Pivot and separate the data, keeping unique by date-state
    # Requires that selectRename have every variable as modifier_type
    df <- df %>%
        pivot_longer(-c(date, state)) %>%
        tidyr::separate(name, into=c("modifier", "metric"), sep="_")

    # Summary NA statistics of the new dataset
    cat("\nSummary statistics for processed and pivoted data\n")
    df %>%
        group_by(modifier, metric) %>%
        summarize(n=n(), nna=sum(is.na(value)), sum=sum(value, na.rm=TRUE), .groups="drop") %>%
        print()
    cat("\n")
    
    # Return the data frame
    df
    
}

The function is applied to create cdcPivotLong:

# Pivot the processed CDC data to long date-state-modifier-metric-value format
cdcPivotLong <- renamePivotProcessed(cdcProcessed)
## 
## *** Variables that will be dropped (not in selectRename vector) include:
## submission_date
## created_at
## consent_cases
## consent_deaths
## creation_date
## 
## 
## *** Variables passed in selectRename that are not in the data include:
## 
## 
## Summary statistics for processed and pivoted data
## # A tibble: 12 x 5
##    modifier metric     n   nna        sum
##    <chr>    <chr>  <int> <int>      <dbl>
##  1 bool     cases  26820  4917      14722
##  2 bool     deaths 26820  4023      15198
##  3 conf     cases  26820 14842 2067083705
##  4 conf     deaths 26820 14478   51520966
##  5 new      cases  26820     0   30995591
##  6 new      deaths 26820     0     546212
##  7 pnew     cases  26820  4923    3328224
##  8 pnew     deaths 26820  5102      41359
##  9 prob     cases  26820 14843  230283006
## 10 prob     deaths 26820 14478    5712572
## 11 tot      cases  26820     0 4503152844
## 12 tot      deaths 26820     0   96207327
glimpse(cdcPivotLong)
## Rows: 321,840
## Columns: 5
## $ date     <date> 2021-01-01, 2021-01-01, 2021-01-01, 2021-01-01, 2021-01-0...
## $ state    <chr> "FL", "FL", "FL", "FL", "FL", "FL", "FL", "FL", "FL", "FL"...
## $ modifier <chr> "tot", "conf", "prob", "new", "pnew", "tot", "conf", "prob...
## $ metric   <chr> "cases", "cases", "cases", "cases", "cases", "deaths", "de...
## $ value    <dbl> 1300528, NA, NA, 0, 6063, 21673, NA, NA, 0, 7, 0, 0, 7145,...

Next steps are to combine the NYS/NYC data as NY and to filter to the states and DC. A function is written to combine states (will be applied only to NY and NYC for these data):

# Function to combine states
combineStates <- function(df, 
                          mapper=comboStates,
                          sortBy=c("date", "state", "modifier", "metric"),
                          boolMod=c("bool")
                          ) {
    
    # Remap and aggregate jurisdictions (e.g. NYC + NY -> NY): numeric values are
    # summed (NA-aware), boolean modifiers are set to NA, unaffected states pass
    # through unchanged; result is sorted and checked for uniqueness on sortBy
    #
    # FUNCTION ARGUMENTS:
    # df: a data frame resulting from renamePivotProcessed() that includes date-state-modifier-metric-value
    # mapper: a named vector c('currentState'='combinedState') listing the states to be remapped
    # sortBy: order that the resulting file should be sorted (and unique) by
    # boolMod: modifier indicating value is a boolean rather than number (cannot be summed)
    
    # Split the file in to stand-alone and combined
    df <- df %>%
        mutate(combined=state %in% names(mapper))
    dfAlone <- df %>% filter(!combined)
    dfCombo <- df %>% filter(combined)
    
    # Process the combined file to give it the new state name
    # Group by the final sorting variables
    # BUG FIX: use the mapper argument; previously the global comboStates was
    # referenced directly, silently ignoring any user-supplied mapper
    dfCombo <- dfCombo %>%
        mutate(state=unname(mapper[state])) %>%
        group_by_at(vars(all_of(sortBy))) %>%
        arrange_at(vars(all_of(sortBy)))
    
    # Numeric variables should be summed, with NA+NA=NA and NA+number=number
    # (declared locally so the function is self-contained)
    specNASum <- function(x) if (any(!is.na(x))) sum(x, na.rm=TRUE) else NA

    # Sum the value field, leave the combined field as-is
    dfCombo <- dfCombo %>%
        summarize(value=specNASum(value), combined=first(combined), .groups="drop")
    
    # Boolean variables should be output as NA since there is no sensible combining
    # FIX: plain %in%; all_of() is a tidyselect helper and errors outside
    # selecting functions in current dplyr
    dfCombo <- dfCombo %>%
        mutate(value=ifelse(modifier %in% boolMod, NA, value))
    
    # Combine the data, sort, check for uniqueness, and return
    dfAll <- bind_rows(dfCombo, dfAlone) %>%
        arrange_at(vars(all_of(sortBy)))
    if (nrow(distinct(dfAll %>% select_at(vars(all_of(sortBy))))) != nrow(dfAll)) stop("\nRow mismatch\n")
    dfAll
    
}

The function is then applied to the cdcPivotLong data:

# List of states to be changed 
# Also include any state that is mapped to itself with some other state mapping to it
# Format is 'currentState'='combinedState'
comboStates <- c("NYC"="NY", 
                 "NY"="NY"
                 )

# Run function
# Combines the NYC and NY series into a single NY series (values summed, booleans -> NA)
cdcAnalysis <- combineStates(cdcPivotLong)

# Check for prevalence of NA by metric-modifier
# NOTE(review): summarize() here leaves the result grouped by 'metric' (no .groups argument)
cdcAnalysis %>%
    group_by(metric, modifier) %>%
    summarize(sumValue=sum(value, na.rm=TRUE), n=n(), nna=sum(is.na(value)), nComb=sum(combined))
## `summarise()` regrouping output by 'metric' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   metric [2]
##    metric modifier   sumValue     n   nna nComb
##    <chr>  <chr>         <dbl> <int> <int> <int>
##  1 cases  bool          14275 26373  5364   447
##  2 cases  conf     2067083705 26373 14395   447
##  3 cases  new        30995591 26373     0   447
##  4 cases  pnew        3328224 26373  4835   447
##  5 cases  prob      230283006 26373 14396   447
##  6 cases  tot      4503152844 26373     0   447
##  7 deaths bool          14751 26373  4470   447
##  8 deaths conf       51520966 26373 14031   447
##  9 deaths new          546212 26373     0   447
## 10 deaths pnew          41359 26373  5014   447
## 11 deaths prob        5712572 26373 14031   447
## 12 deaths tot        96207327 26373     0   447

The fields for tot/new by cases/deaths appear appropriate for further usage. Next steps are to compare the totals to existing data from the CTP (final date 2021-03-07).

The final CTP data are pulled, with cumulative data through 2021-03-07 calculated. Cumulative data through 2021-03-07 from the CDC file are also extracted:

# Read in existing data from CTP
ctp_hier6_210401 <- readFromRDS("ctp_hier6_210401")

# Totals through 2021-03-07 for CTP (final date of collection)
# Sum each numeric metric by state, pivot long, and tag rows with modifier="ctp"
# so they can be stacked against the CDC totals in checkStateCompare()
ctp_cum_210307 <- ctp_hier6_210401$dfFiltered %>%
    group_by(state) %>%
    summarize_if(is.numeric, sum, na.rm=TRUE) %>%
    pivot_longer(-c(state), names_to="metric") %>%
    mutate(modifier="ctp") %>%
    select(state, modifier, metric, value, everything())
ctp_cum_210307
## # A tibble: 204 x 4
##    state modifier metric   value
##    <chr> <chr>    <chr>    <dbl>
##  1 AK    ctp      cases    56886
##  2 AK    ctp      deaths     305
##  3 AK    ctp      hosp     17804
##  4 AK    ctp      tests  1731620
##  5 AL    ctp      cases   499819
##  6 AL    ctp      deaths   10148
##  7 AL    ctp      hosp    404951
##  8 AL    ctp      tests  2323788
##  9 AR    ctp      cases   324818
## 10 AR    ctp      deaths    5319
## # ... with 194 more rows
# Totals through 2021-03-07 for CDC (to match CTP data)
# Uses final tot-cases, final tot-deaths, cumsum new-cases, cumsum new-deaths
# Restricted to the 50 states plus DC (state.abb is base R's vector of abbreviations)
# NOTE(review): pivot_wider() gets id_cols positionally; newer tidyr prefers id_cols=c(...)
cdc_cum_210307 <- cdcAnalysis %>%
    filter(date <= as.Date("2021-03-07"), 
           metric %in% c("cases", "deaths"), 
           modifier %in% c("new", "tot"), 
           state %in% c(state.abb, "DC")
           ) %>%
    pivot_wider(c(state, date, metric), names_from="modifier", values_from="value") %>%
    group_by(state, metric) %>%
    summarize(new=sum(new, na.rm=TRUE), tot=last(tot, order_by=date), .groups="drop") %>%
    pivot_longer(-c(state, metric), names_to="modifier") %>%
    select(state, modifier, metric, value, everything())
cdc_cum_210307
## # A tibble: 204 x 4
##    state modifier metric  value
##    <chr> <chr>    <chr>   <dbl>
##  1 AK    new      cases   56886
##  2 AK    tot      cases   56886
##  3 AK    new      deaths    301
##  4 AK    tot      deaths    301
##  5 AL    new      cases  497705
##  6 AL    tot      cases  499819
##  7 AL    new      deaths  10148
##  8 AL    tot      deaths  10148
##  9 AR    new      cases  324818
## 10 AR    tot      cases  324818
## # ... with 194 more rows

Next steps are to write a function to pull a specific metric and plot the differences in a ‘base’ modifier and any other values for modifier.

A function is written to compare data from different sources:

# Function to assess variables new_x, tot_x, and x for percentage differences
checkStateCompare <- function(..., 
                              met="cases", 
                              baseModifier="ctp"
                              ) {
    
    # Stack one or more state-modifier-metric-value data frames, express each
    # value as a fraction of the baseModifier value for the same state, plot
    # the non-baseline modifiers, and return the combined data frame
    #
    # FUNCTION ARGUMENTS:
    # ...: one or more data frames of the form state-modifier-metric-value
    # met: the metric to be used, "cases" or "deaths"
    # baseModifier: value of the modifier field that signifies the baseline for comparison
    
    # Bind the data frames, and calculate as a percentage of the "base" modifier
    # FIX: plain %in% / == comparisons; all_of() is a tidyselect helper and
    # errors outside selecting functions in current dplyr
    # NOTE: if a state has no baseModifier row, the denominator is 0 and pct_of is Inf
    df <- bind_rows(...) %>%
        filter(metric %in% met) %>%
        arrange(state, modifier) %>%
        group_by(state) %>%
        mutate(pct_of=value/sum(ifelse(modifier==baseModifier, value, 0))) %>%
        ungroup()

    # Plot the percentage differences for fields other than reference
    # (dashed line at 1 marks perfect agreement with the baseline)
    p1 <- df %>%
        filter(modifier != baseModifier) %>%
        ggplot(aes(x=fct_reorder(state, pct_of), 
                   y=pct_of, 
                   color=modifier
                   )
               ) + 
        geom_point() + 
        coord_flip() + 
        labs(x="", 
             y=paste0("Reported ", met, " as function of baseline data from ", baseModifier), 
             title=paste0("Comparison to baseline data for ", met)
             ) + 
        scale_color_discrete("Comparison metric") + 
        geom_hline(yintercept=1, lty=2)
    print(p1)
    
    # Return the data frame
    df
    
}

The function can then be applied to the cases and deaths data:

# Run comparisons for cases and deaths against the CTP baseline
# (each call also prints a by-state comparison plot)
caseCompare <- checkStateCompare(ctp_cum_210307, cdc_cum_210307, met="cases")

deathCompare <- checkStateCompare(ctp_cum_210307, cdc_cum_210307, met="deaths")

# Print cases and deaths that are at least 3% different
# (pct_of is the ratio to the CTP baseline, so 1 means perfectly aligned)
caseCompare %>%
    filter(abs(1-pct_of) >= 0.03)
## # A tibble: 8 x 5
##   state modifier metric  value pct_of
##   <chr> <chr>    <chr>   <dbl>  <dbl>
## 1 HI    new      cases   27031  0.942
## 2 HI    tot      cases   27031  0.942
## 3 IA    new      cases  339574  1.20 
## 4 IA    tot      cases  339574  1.20 
## 5 MA    new      cases  568979  0.962
## 6 MA    tot      cases  568979  0.962
## 7 MO    new      cases  562129  1.17 
## 8 MO    tot      cases  562129  1.17
# Deaths that differ from the CTP baseline by at least 3%
deathCompare %>%
    filter(abs(1-pct_of) >= 0.03)
## # A tibble: 9 x 5
##   state modifier metric value pct_of
##   <chr> <chr>    <chr>  <dbl>  <dbl>
## 1 IN    new      deaths 11230  0.882
## 2 NJ    new      deaths 21750  0.923
## 3 NY    new      deaths 44045  1.13 
## 4 NY    tot      deaths 48104  1.23 
## 5 OH    new      deaths 13893  0.787
## 6 OK    new      deaths  6598  1.46 
## 7 OK    tot      deaths  6598  1.46 
## 8 TX    new      deaths 46689  1.05 
## 9 TX    tot      deaths 46689  1.05

Data are well aligned between CTP and CDC as of 2021-03-07. Primary differences include:

  • CDC reports significantly greater cases for Iowa (+20%) and Missouri (+17%)
  • CDC reports modestly lower cases for Massachusetts (-4%) and Hawaii (-6%)
  • CDC reports significantly more deaths for Oklahoma (+46%) and New York (+13% on sum of ‘new’, +23% on ‘tot’)
  • CDC reports modestly more deaths for Texas (+5%)
  • CDC reports significantly lower deaths for Indiana (-11% using ‘new’), Ohio (-21% using ‘new’), and New Jersey (-8% using ‘new’), though sums of ‘tot’ for these states appear well aligned to the CTP data

In general, the CDC ‘tot’ field appears well aligned to the cumulative totals from CTP, and can likely be used on a go-forward as a measure of cumulative disease impact in each state.

Next steps are to check the curve evolutions, particularly since states with differences in ‘new’ and ‘tot’ have large corrections all reported at once.

A function is written to compare the shapes of the curves. Curve shape will be assessed as the percentage distribution of ‘new’ cases and deaths by time:

curvePercent <- function(df, 
                         keyMetric, 
                         keyModifiers=c("new", "ctp"), 
                         dateMax=as.Date("2021-03-07")
                         ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame containing date-state-modifier-metric-value
    # keyMetric: the metric whose curve shapes are compared (e.g., "cases")
    # keyModifiers: the two modifier values to compare (RMSE assumes exactly two)
    # dateMax: keep only data on or before this date
    
    # RETURNS: wide data frame of cumulative percentage of total by
    #          state-date-metric, one column per modifier; also prints a plot
    #          of RMSE between the two curves by state
    
    # Filter for the metric and modifiers of interest and only through dateMax
    # (plain argument names; all_of() is deprecated in data-masking contexts)
    df <- df %>%
        filter(metric %in% keyMetric, 
               modifier %in% keyModifiers, 
               date <= dateMax
               )
    
    # Calculate the cumulative percentage by date within state-metric-modifier
    df <- df %>%
        mutate(value=ifelse(is.na(value), 0, value)) %>%
        arrange(date) %>%
        group_by(state, metric, modifier) %>%
        mutate(pct=cumsum(value)/sum(value)) %>%
        ungroup()
    
    # Pivot for percentages by state-date-metric
    # (id_cols is named explicitly - passing it positionally is deprecated in
    #  tidyr >= 1.2; across() supersedes mutate_at())
    df <- df %>%
        pivot_wider(id_cols=c(state, date, metric), names_from="modifier", values_from="pct") %>%
        mutate(across(all_of(keyModifiers), ~ifelse(is.na(.x), 0, .x)))
    
    # Calculate RMSE for each state-metric (assumes only two columns - can expand later)
    # (.data[[ ]] is the supported way to reference a column by string, vs get())
    stateRMSE <- df %>%
        group_by(state, metric) %>%
        summarize(rmse=sqrt(mean((.data[[keyModifiers[1]]]-.data[[keyModifiers[2]]])**2)), .groups="drop")

    # Create plots of RMSE by state
    p1 <- stateRMSE %>%
        ggplot(aes(x=fct_reorder(state, rmse), y=rmse)) + 
        geom_point() +
        geom_text(aes(label=round(rmse, 3), y=rmse+0.00025), hjust=0, size=3) +
        coord_flip() + 
        labs(x="", 
             y="RMSE for cumulative percentage by date", 
             title=paste0("Curve comparison for ", keyMetric)
             )
    print(p1)
    
    # Return the data frame
    df
    
}

The CTP data are formatted for use with the function, integrated with cdcPivotLong, and assessed:

# Reshape the CTP data to date-state-metric-value, tag it with modifier "ctp",
# stack it with the long CDC data, and keep only the 50 states plus DC
fullPivotLong <- ctp_hier6_210401$dfFiltered %>%
    pivot_longer(-c(date, state), names_to="metric") %>%
    mutate(modifier="ctp") %>%
    bind_rows(cdcPivotLong) %>%
    filter(state %in% c(state.abb, "DC"))

# Compare curve shapes for cases and deaths (each call prints an RMSE plot)
curveCases <- curvePercent(fullPivotLong, keyMetric="cases")

curveDeath <- curvePercent(fullPivotLong, keyMetric="deaths")

Many of the same locales that have differences between the ‘new’ and ‘total’ fields in CDC also have differences in curve shape with CTP. Next steps are to explore some of the larger differences.

A function is written to explore curves for a specific metric and set of states:

exploreCurve <- function(df, 
                         keyMetric=NULL, 
                         keyStates=NULL
                         ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing state-date-metric-[1 column per metric type]
    # keyMetric: the metric to filter in column 'metric' (NULL means determine from data)
    # keyStates: the list of states to be plotted (NULL means pick the top 9 from data)
    
    # SIDE EFFECTS: prints a cumulative and an incremental faceted curve plot
    
    # Get keyMetric if it is passed as NULL (the most frequent value in 'metric')
    if (is.null(keyMetric)) {
        keyMetric <- df %>% count(metric) %>% arrange(-n) %>% head(1) %>% pull(metric)
        cat("\nKey metric will be: ", keyMetric)
    }
    
    # Get the numerical columns for plotting
    # (select(where()) supersedes select_if())
    colPlot <- df %>% select(where(is.numeric)) %>% names()
    cat("\nColumns to be plotted will be: ", colPlot, "\n")
    
    # Get keyStates if not passed: the 9 states with the largest RMSE between
    # the first two plotting columns (assumes at least two numeric columns)
    if (is.null(keyStates)) {
        keyStates <- df %>%
            filter(metric==keyMetric) %>%
            group_by(state) %>%
            summarize(rmse=sqrt(mean((.data[[colPlot[1]]]-.data[[colPlot[2]]])**2)), .groups="drop") %>%
            arrange(-rmse) %>%
            head(9) %>%
            pull(state)
        cat("States to be plotted will be: ", keyStates, "\n")
    }
    
    # Create faceted plots for the requested metrics
    # (delta is the day-over-day change, with the first row kept as its own value;
    #  the frame is ungrouped afterwards so downstream use is not silently grouped)
    dfPlot <- df %>%
        filter(metric==keyMetric, state %in% keyStates) %>%
        select(all_of(c("state", "date", colPlot))) %>%
        mutate(state=factor(state, levels=keyStates)) %>%
        pivot_longer(-c("state", "date")) %>%
        arrange(state, name, date) %>%
        group_by(state, name) %>%
        mutate(delta=ifelse(row_number()==1, value, value-lag(value, 1))) %>%
        ungroup()
    p1 <- dfPlot %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(group=name, color=name)) + 
        facet_wrap(~state) + 
        labs(x="", 
             y="Cumulative percentage of total recorded by date", 
             title=paste0("Curve evolution by percentage for metric: ", keyMetric), 
             subtitle="Cumulative"
             ) + 
        scale_color_discrete("Data source")
    p2 <- dfPlot %>%
        ggplot(aes(x=date, y=delta)) + 
        geom_line(aes(group=name, color=name)) + 
        facet_wrap(~state, scales="free_y") + 
        labs(x="", 
             y="Incremental percentage of total recorded by date", 
             title=paste0("Curve evolution by percentage for metric: ", keyMetric), 
             subtitle="Incremental"
             ) + 
        scale_color_discrete("Data source")
    print(p1)
    print(p2)
    
}

The routine is then run for cases and deaths, with a larger plotting area:

# Explore case curves; defaults pick the 9 states with the largest RMSE
exploreCurve(curveCases)
## 
## Key metric will be:  cases
## Columns to be plotted will be:  ctp new 
## States to be plotted will be:  NY OK IA MO HI MA SC WA GA

# Explore death curves; defaults pick the 9 states with the largest RMSE
exploreCurve(curveDeath)
## 
## Key metric will be:  deaths
## Columns to be plotted will be:  ctp new 
## States to be plotted will be:  NY OH OK IN TX RI NJ DE SC

Visually, the shapes of the case curves are nearly overlapping, and segments based on shape will likely be the same whether using CTP or CDC data. The shapes of the death curves vary meaningfully in NY and modestly in OH, OK, and IN. While the different curve evolutions are unlikely to drive different segments, it is worth keeping an eye on (particularly for NY).

Next steps are to adapt the CDC data and existing CTP code so they can be used together.

Coding Steps for State Data

The main function used previously is readRunCOVIDTrackingProject(), which performs multiple tasks:

STEP 1: Extracts a file of population by state (by default uses 2015 population from usmap::statepop)
STEP 2a^: Downloads the latest data from COVID Tracking Project if requested
STEP 2b^: Reads in data from a specified local file (may have just been downloaded in step 2a), and checks control total trends against a previous version of the file
STEP 3^: Processes the loaded data file for keeping proper variables, dropping non-valid states, etc.
STEP 4^: Adds per-capita metrics for cases, deaths, tests, and hospitalizations
STEP 5: Adds existing clusters by state if passed as an argument to useClusters=, otherwise creates new segments based on user-defined parameters
STEP 6^^: Creates assessment plots for the state-level clusters
STEP 7^^: Creates consolidated plots of cases, hospitalizations, deaths, and tests
STEP 8^^: Optionally, creates plots of cumulative burden by segments and by state
STEP 9: Returns a list of key data frames, modeling objects, named cluster vectors, etc.

^ The user can instead specify a previously processed file and skip steps 2a, 2b, 3, and 4. The previously processed file needs to be formatted and filtered such that it can be used “as is”
^^ The user can skip the segment-level assessments by setting skipAssessmentPlots=TRUE

The main function and the helper functions were previously updated to allow for using 2021 data. The main function is copied below and will eventually be adapted for CDC daily data:

# Function to download/load, process, segment, and analyze data for CDC daily
# Needs to be updated
# NOTE(review): STEP 2a below still downloads from the COVID Tracking Project
# API, not a CDC endpoint - update before relying on this for new downloads
readRunCDCDaily <- function(thruLabel, 
                            downloadTo=NULL, 
                            readFrom=downloadTo, 
                            compareFile=NULL,
                            dateChangePlot=FALSE,
                            dateMetricPrint=TRUE,
                            writeLog=NULL,
                            ovrwriteLog=TRUE,
                            dfPerCapita=NULL,
                            useClusters=NULL,
                            hierarchical=TRUE,
                            returnList=!hierarchical, 
                            kCut=6,
                            reAssignState=vector("list", 0),
                            makeCumulativePlots=TRUE,
                            skipAssessmentPlots=FALSE,
                            ...
                            ) {
    
    # FUNCTION ARGUMENTS:
    # thruLabel: the label for when the data are through (e.g., "Aug 30, 2020")
    # downloadTo: download the most recent CDC daily data to this location
    #             NULL means do not download any data
    # readFrom: location for reading in the CDC daily data (defaults to downloadTo)
    # compareFile: name of the file to use for comparisons when reading in raw data (NULL means no comparison)
    # dateChangePlot: boolean, should changes in dates be captured as a plot rather than as a list?
    # dateMetricPrint: boolean, should the changes by date and metric be printed to the main log?
    # writeLog: name of a separate log file for capturing detailed data on changes between files
    #           NULL means no detailed data captured
    # ovrwriteLog: boolean, should the log file be overwritten and started again from scratch?
    # dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
    # useClusters: file containing clusters by state (NULL means make the clusters from the data)
    # hierarchical: boolean, should hierarchical clusters be produced (if FALSE, will be k-means)?
    # returnList: boolean, should a list be returned or just the cluster object?
    #             refers to what is returned by clusterStates(); the main function always returns a list
    # kCut: number of segments when cutting the hierarchical tree
    # reAssignState: mapping file for assigning a state to another state's cluster
    #                format list("stateToChange"="stateClusterToAssign")
    # makeCumulativePlots: whether to make plots of cumulative metrics
    # skipAssessmentPlots: boolean to skip the plots for assessClusters()
    #                      especially useful if just exploring dendrograms or silhouette widths
    # ...: arguments to be passed to clusterStates(), will be used only if useClusters is NULL
    
    
    # STEP 1: Get state data (abbreviation, name, and population)
    stateData <- getStateData()
    
    # Helper function for glimpsing a data frame with a leading label
    glimpseFile <- function(x, txt) {
        cat(txt)
        glimpse(x)
    }
            
    # STEPS 2-4 are run only if dfPerCapita does not exist
    if (is.null(dfPerCapita)) {
        
        # STEP 2a: Download latest CDC daily data (if requested)
        # NOTE(review): this URL is the COVID Tracking Project feed (frozen as of
        # 2021-03-07), not CDC
        if (!is.null(downloadTo)) {
            downloadCOVIDbyState(fileName=downloadTo, 
                                 api="https://api.covidtracking.com/v1/states/daily.csv"
                                 )
        }
        
        # STEP 2b: Read-in CDC Daily Data, optionally comparing against a prior file
        dfRaw <- readCOViDbyState(readFrom, 
                                  checkFile=compareFile, 
                                  dateChangePlot=dateChangePlot, 
                                  dateMetricPrint=dateMetricPrint, 
                                  writeLog=writeLog, 
                                  ovrwriteLog=ovrwriteLog
                                  )
        # Glimpse to the console, or capture into the detailed log if one is in use
        if (is.null(writeLog)) glimpseFile(dfRaw, txt="\nRaw data file:\n")
        else capture.output(glimpseFile(dfRaw, txt="\nRaw data file:\n"), file=writeLog, append=TRUE)
        
        # STEP 3: Process the data so that it includes all requested key variables
        # (variable names below are CTP fields; presumably they will change for
        #  CDC data - TODO confirm when the data source is switched)
        varsFilter <- c("date", "state", "positiveIncrease", "deathIncrease", 
                        "hospitalizedCurrently", "totalTestResultsIncrease"
                        )
        dfFiltered <- processCVData(dfRaw, 
                                    varsKeep=varsFilter, 
                                    varsRename=c(positiveIncrease="cases", 
                                                 deathIncrease="deaths", 
                                                 hospitalizedCurrently="hosp", 
                                                 totalTestResultsIncrease="tests"
                                    )
        )
        if (is.null(writeLog)) glimpseFile(dfFiltered, txt="\nFiltered data file:\n")
        else capture.output(glimpseFile(dfFiltered, txt="\nFiltered data file:\n"), file=writeLog, append=TRUE)
        
        # STEP 4: Convert to per capita (per million)
        dfPerCapita <- helperMakePerCapita(dfFiltered, 
                                           mapVars=c("cases"="cpm", "deaths"="dpm", 
                                                     "hosp"="hpm", "tests"="tpm"
                                           ), 
                                           popData=stateData
        )
        if (is.null(writeLog)) glimpseFile(dfPerCapita, txt="\nPer capita data file:\n")
        else capture.output(glimpseFile(dfPerCapita, txt="\nPer capita data file:\n"), 
                            file=writeLog, 
                            append=TRUE
                            )
        
    } else {
        # dfPerCapita was supplied directly; raw and filtered frames are unavailable
        dfRaw <- NULL
        dfFiltered <- NULL
    }
    
    
    # STEP 5: Create the clusters (if they have not been passed)
    if (is.null(useClusters)) {
        # Run the clustering process
        clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
        # If hierarchical clusters, cut the tree, otherwise use the output object directly
        if (hierarchical) {
            useClusters <- cutree(clData, k=kCut)
        } else {
            useClusters <- clData$objCluster$cluster
        }
        # If requested, manually assign clusters to the cluster for another state
        for (xNum in seq_len(length(reAssignState))) {
            useClusters[names(reAssignState)[xNum]] <- useClusters[reAssignState[[xNum]]]
        }
        
    }
    
    
    # STEP 5a: Stop the process and return what is available if skipAssessmentPlots is TRUE
    # (plot-related elements are returned as NULL so the list shape matches STEP 9)
    if (skipAssessmentPlots) {
        return(list(stateData=stateData, 
                    dfRaw=dfRaw, 
                    dfFiltered=dfFiltered, 
                    dfPerCapita=dfPerCapita, 
                    useClusters=useClusters, 
                    plotData=NULL, 
                    consolidatedPlotData=NULL, 
                    clCum=NULL
                    )
               )
    }
    
    
    # STEP 6: Create the cluster assessments
    plotData <- assessClusters(useClusters, 
                               dfState=stateData, 
                               dfBurden=dfPerCapita,
                               thruLabel=thruLabel,
                               plotsTogether=TRUE
    )
    
    
    # STEP 7: Plot the consolidated metrics
    subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new), Tests: new tests"
    consolidatedPlotData <- plotConsolidatedMetrics(plotData, 
                                                    varMain=c("state", "cluster", "date", "pop",
                                                              "cases", "deaths", "hosp", "tests"
                                                    ), 
                                                    subT=subT, 
                                                    nrowPlot2=2
    )
    
    # STEP 8: Create cumulative metrics if requested
    if (makeCumulativePlots) {
        # Derive percent-positive (cases/tests) alongside the other rolling metrics
        consPos <- consolidatedPlotData %>%
            ungroup() %>%
            select(state, cluster, date, name, vpm7) %>%
            arrange(state, cluster, date, name) %>%
            pivot_wider(-vpm7, names_from="name", values_from="vpm7") %>%
            mutate(pctpos=cases/tests) %>%
            pivot_longer(-c(state, cluster, date), values_to="vpm7") %>%
            filter(!is.na(vpm7))
        clCum <- makeCumulative(consPos)
        # Overall cumulative plot, then metric-specific plots with flagged states
        plotCumulativeData(clCum, 
                           keyMetricp2="", 
                           flagsp2="", 
                           makep1=TRUE, 
                           makep2=FALSE
        )
        plotCumulativeData(clCum, 
                           keyMetricp2="deaths", 
                           flagsp2=findFlagStates(clCum, keyMetricVal = "deaths")
        )
        plotCumulativeData(clCum, 
                           keyMetricp2="cases", 
                           flagsp2=findFlagStates(clCum, keyMetricVal = "cases")
        )
        plotCumulativeData(clCum, 
                           keyMetricp2="tests", 
                           flagsp2=findFlagStates(clCum, keyMetricVal = "tests")
        )
    } else {
        clCum <- NULL
    }
    
    
    # STEP 9: Return a list of the key data
    list(stateData=stateData, 
         dfRaw=dfRaw, 
         dfFiltered=dfFiltered, 
         dfPerCapita=dfPerCapita, 
         useClusters=useClusters, 
         plotData=plotData, 
         consolidatedPlotData=consolidatedPlotData, 
         clCum=clCum
    )
    
    
}

The state data are downloaded and a per capita file created:

# Function to extract and format key state data
getStateData <- function(df=readFromRDS("statePop2019"), 
                         renameVars=c("stateAbb"="state", "NAME"="name", "pop_2019"="pop"), 
                         keepVars=c("state", "name", "pop")
                         ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame containing state data
    # renameVars: variables to be renamed, using a named vector with format "originalName"="newName"
    # keepVars: variables to be kept in the final file
    
    # Rename variables where appropriate (names absent from renameVars are left as-is)
    mapped <- renameVars[names(df)]
    names(df) <- ifelse(is.na(mapped), names(df), mapped)
    
    # Return file with only key variables kept
    # (select(all_of()) supersedes select_at(vars(all_of())))
    df %>%
        select(all_of(keepVars))
    
}

# Run getStateData() as a stand-alone
stateDataCDC <- getStateData()

# Create an analysis file, pivoted wider, for new and total cases/deaths
# (id_cols is named explicitly - positional use is deprecated in tidyr >= 1.2)
cdcFiltered <- cdcAnalysis %>%
    filter(metric %in% c("cases", "deaths"), modifier %in% c("new", "tot")) %>%
    mutate(key=paste(modifier, metric, sep="_")) %>%
    pivot_wider(id_cols=c("date", "state"), names_from="key", values_from="value")


# Helper function to create per capita metrics
helperPerCapita <- function(df, 
                            origVar, 
                            newName,
                            byVar="state",
                            popVar="pop",
                            popData=stateData,
                            mult=1000000
                            ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame currently being processed
    # origVar: the variable (column name as string) to be converted to per capita
    # newName: the new per capita variable name
    # byVar: the variable that will be merged by
    # popVar: the name of the population variable in the popData file
    # popData: the file containing the population data
    # mult: the multiplier, so that the metric is "per mult people"
    
    # Create the per capita variable
    # (select(all_of()) supersedes select_at(vars(...)); the .data pronoun is
    #  the supported way to reference a column by string inside mutate())
    df %>%
        inner_join(select(popData, all_of(c(byVar, popVar))), by=byVar) %>%
        mutate(!!newName:=mult*.data[[origVar]]/.data[[popVar]]) %>%
        select(-all_of(popVar))
    
}


# Helper function to create rolling aggregates
helperRollingAgg <- function(df, 
                             origVar, 
                             newName,
                             func=zoo::rollmean,
                             k=7, 
                             fill=NA, 
                             ...
                             ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame containing the data (may be grouped; func is applied per group)
    # origVar: the original data column name (a string)
    # newName: the new variable column name (a string)
    # func: the function to be applied (zoo::rollmean will be by far the most common)
    # k: the periodicity (k=7 is rolling weekly data)
    # fill: how to fill leading/trailing data to maintain the same vector lengths
    # ...: any other arguments to be passed to func
    
    # Create the appropriate variable
    # (the .data pronoun references the column by string; more robust than get())
    df %>%
        mutate(!!newName:=func(.data[[origVar]], k=k, fill=fill, ...))
    
}


# Function to add per capita and rolling to the base data frame
helperMakePerCapita <- function(df, 
                                mapVars=c("cases"="cpm", "deaths"="dpm"),
                                popData=stateData,
                                k=7
                                ) {
    
    # FUNCTION ARGUMENTS:
    # df: the initial data frame for conversion
    # mapVars: named vector of variables to be converted 'original name'='converted name'
    # popData: the population data frame passed through to helperPerCapita()
    # k: the rolling time period to use
    
    # Create the variables for per capita
    # ([[ ]] extraction drops the element's name, yielding a clean column name)
    for (origVar in names(mapVars)) {
        df <- df %>% 
            helperPerCapita(origVar=origVar, newName=mapVars[[origVar]], popData=popData)
    }
    # Arrange the data by date in preparation for rolling aggregations
    # (a global date sort leaves each state's rows in date order, which is all
    #  the grouped rolling step below requires)
    df <- df %>% 
        group_by(state) %>% 
        arrange(date)

    # Create the rolling variables (e.g., cpm -> cpm7 for k=7), computed within state
    for (newVar in mapVars) {
        df <- df %>% 
            helperRollingAgg(origVar=newVar, newName=paste0(newVar, k), k=k)
    }
    
    # Return the updated data frame, ungrouped
    df %>%
        ungroup()
    
}

# Convert to per capita (per million), adding 7-day rolling means (cpm7, etc.)
cdcPerCapita <- helperMakePerCapita(cdcFiltered, 
                                    mapVars=c("new_cases"="cpm", "new_deaths"="dpm", 
                                              "tot_cases"="tcpm", "tot_deaths"="tdpm"
                                              ), 
                                    popData=stateDataCDC
                                    )
cdcPerCapita
## # A tibble: 22,797 x 14
##    date       state new_cases new_deaths tot_cases tot_deaths   cpm   dpm  tcpm
##    <date>     <chr>     <dbl>      <dbl>     <dbl>      <dbl> <dbl> <dbl> <dbl>
##  1 2020-01-22 AK            0          0         0          0     0     0     0
##  2 2020-01-22 AL            0          0         0          0     0     0     0
##  3 2020-01-22 AR            0          0         0          0     0     0     0
##  4 2020-01-22 AZ            0          0         0          0     0     0     0
##  5 2020-01-22 CA            0          0         0          0     0     0     0
##  6 2020-01-22 CO            0          0         0          0     0     0     0
##  7 2020-01-22 CT            0          0         0          0     0     0     0
##  8 2020-01-22 DC            0          0         0          0     0     0     0
##  9 2020-01-22 DE            0          0         0          0     0     0     0
## 10 2020-01-22 FL            0          0         0          0     0     0     0
## # ... with 22,787 more rows, and 5 more variables: tdpm <dbl>, cpm7 <dbl>,
## #   dpm7 <dbl>, tcpm7 <dbl>, tdpm7 <dbl>
# Plot each 7-day rolling per-capita metric by state, overlaying the
# cross-state median (computed per date-metric and appended as state "med")
cdcPerCapita %>%
    select(date, state, contains("7")) %>%
    pivot_longer(-c("date", "state")) %>%
    filter(!is.na(value)) %>%
    # '.' is the piped data frame; median rows are appended with state="med"
    bind_rows(mutate(summarize(group_by(., date, name), value=median(value), .groups="drop"), 
                     state="med"
                     )
              ) %>%
    ggplot(aes(x=date, y=value)) + 
    geom_line(data=~filter(., state!="med"), aes(group=state), alpha=0.15) +
    geom_line(data=~filter(., state=="med"), aes(group=state), color="blue", size=1.5) + 
    labs(x="", y="Per million", title="Evolution metrics per capita by state", subtitle="Blue is median") +
    facet_wrap(~name, scales="free_y")

Per capita data appear broadly as expected. Next steps are to update the segmentation and clustering algorithms. The function call is copied, with the function then updated:

# Clustering code
# STEP 5: Create the clusters (if they have not been passed)
# if (is.null(useClusters)) {
#     # Run the clustering process
#     clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
#     # If hierarchical clusters, cut the tree, otherwise use the output object directly
#     if (hierarchical) {
#         useClusters <- cutree(clData, k=kCut)
#     } else {
#         useClusters <- clData$objCluster$cluster
#     }
#     # If requested, manually assign clusters to the cluster for another state
#     for (xNum in seq_len(length(reAssignState))) {
#         useClusters[names(reAssignState)[xNum]] <- useClusters[reAssignState[[xNum]]]
#     }
# }

# Function to create an elbow plot for various numbers of clusters in the data
helperElbow <- function(mtx, 
                        testCenters, 
                        iter.max, 
                        nstart, 
                        silhouette=FALSE
                        ) {
    
    # FUNCTION ARGUMENTS:
    # mtx: a numeric matrix, or an object that can be coerced to a numeric matrix (no character fields)
    # testCenters: integer vector for the centers to be tested
    # iter.max: parameter passed to kmeans
    # nstart: parameter passed to kmeans
    # silhouette: whether to calculate the silhouette score
    
    # Preallocate storage for tot.withinss and silhouetteScore
    totWithin <- vector("numeric", length(testCenters))
    silhouetteScore <- vector("numeric", length(testCenters))
    
    # Create the distancing data (required for silhouette score)
    if (silhouette) distData <- dist(mtx)
    
    # Run k-means for every value in testCenters, and store $tot.withinss
    # (and silhouetteScore, if requested; undefined for k=1, which stays at 0)
    for (n in seq_along(testCenters)) {
        k <- testCenters[n]
        km <- kmeans(mtx, centers=k, iter.max=iter.max, nstart=nstart)
        totWithin[n] <- km$tot.withinss
        # && (not the vectorized &) for the scalar condition inside if()
        if (silhouette && k > 1) silhouetteScore[n] <- mean(cluster::silhouette(km$cluster, distData)[, 3])
    }
    
    # Create the elbow plot
    p1 <- tibble::tibble(n=testCenters, wss=totWithin) %>%
        ggplot(aes(x=n, y=wss)) + 
        geom_point() + 
        geom_line() + 
        geom_text(aes(y=wss + 0.05*max(totWithin), x=n+0.2, label=round(wss, 1))) + 
        labs(x="Number of segments", y="Total Within Sum-Squares", title="Elbow plot") + 
        ylim(c(0, NA))
    
    # Create the silhouette plot if requested
    if (silhouette) {
        p2 <- tibble::tibble(n=testCenters, ss=silhouetteScore) %>%
            ggplot(aes(x=n, y=ss)) + 
            geom_point() + 
            geom_line() + 
            geom_text(aes(y=ss + 0.05*max(silhouetteScore), x=n+0.2, label=round(ss, 1))) + 
            labs(x="Number of segments", y="Mean silhouette width", title="Silhouette plot") + 
            ylim(c(-1, NA))
        gridExtra::grid.arrange(p1, p2, nrow=1)
    } else {
        print(p1)
    }
    
}

# Custom function for creating YYYY-MM for use as the shape of the curve function
customTimeBucket <- function(x) {
    # format() yields zero-padded "YYYY-MM" directly, with no need for the
    # lubridate year/month + stringr padding round-trip
    format(x, "%Y-%m")
}

# Updates to the clustering function
clusterStates <- function(df, 
                          caseVar="cpm", 
                          deathVar="dpm",
                          totCaseVar=NULL, 
                          totDeathVar=NULL,
                          shapeFunc=customTimeBucket, 
                          minShape=NULL, 
                          maxShape=NULL,
                          minDeath=0,
                          maxDeath=Inf,
                          minCase=0,
                          maxCase=Inf,
                          ratioTotalvsShape=1, 
                          ratioDeathvsCase=1, 
                          hierarchical=TRUE, 
                          hierMethod="complete", 
                          nCenters=3, 
                          iter.max=10,
                          nstart=1,
                          testCenters=NULL,
                          returnList=FALSE, 
                          hmlSegs=3, 
                          eslSegs=2,
                          seed=NULL
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame containing cases and deaths data
    # caseVar: the variable containing the daily cases per capita data
    # deathVar: the variable containing the daily deaths per capita data
    # totCaseVar: a variable containing total cases per capita (may differ from sum of new due to bulk adds)
    #             NULL means use sum(caseVar), otherwise use value of totCaseVar on last day of data
    # totDeathVar: a variable containing total deaths per capita (may differ from sum of new due to bulk adds)
    #             NULL means use sum(deathVar), otherwise use value of totDeathVar on last day of data
    # shapeFunc: the function to be used for creating the shape of the curve
    # minShape: the minimum value to be used for shape (to avoid very small amounts of data in Jan/Feb/Mar)
    #           shape is the month, so 4 means start with April data (NULL means keep everything)
    # maxShape: the maximum value to be used for shape (to avoid very small amounts of data in a partial month)
    #           shape is the month, so 9 means end with September data (NULL means keep everything)
    # minDeath: use this value as a floor for the death metric when calculating shape
    # maxDeath: use this value as a maximum when calculating distance using deaths 
    # minCase: use this value as a floor for the case metric when calculating shape
    # maxCase: use this value as a maximum when calculating distance using cases 
    # ratioTotalvsShape: amount of standard deviation to be kept in total variable vs shape variables
    # ratioDeathvsCase: amount of standard deviation to be kept in deaths vs cases 
    #                   (total death data will be scaled to have sd this many times higher than cases)
    #                   (death percentages by time period will be scaled directly by this amount)
    # hierarchical: whether to create hierarchical clusters
    #               TRUE means run hierarchical clustering
    #               FALSE means run kmeans clustering
    #               NA means run rules-based clustering
    # hierMethod: the method for hierarchical clustering (e.g., 'complete' or 'single')
    # nCenters: the number of centers to use for kmeans clustering
    # testCenters: integer vector of centers to test (will create an elbow plot); NULL means do not test
    # iter.max: maximum number of kmeans iterations (default in kmeans algorithm is 10)
    # nstart: number of random sets chosen for kmeans (default in kmeans algorithm is 1)
    # returnList: boolean, if FALSE just the cluster object is returned
    #                      if TRUE, a list is returned with dfCluster and the cluster object
    # hmlSegs: number of segments to create for volume of burden integrated over time
    # eslSegs: number of segments to create for shape of burden over time
    # seed: set the seed to this value (NULL means no seed)

    # Create the timeBucket field, then filter the data to only the time periods of interest
    df <- df %>%
        mutate(timeBucket=shapeFunc(date))
    
    # Limit to only relevant time buckets if requested
    if (!is.null(minShape)) {
        df <- df %>%
            filter(timeBucket >= minShape)
    }
    if (!is.null(maxShape)) {
        df <- df %>%
            filter(timeBucket <= maxShape)
    }

    # Create an aggregate by state, scaled so that they have the proper ratio
    # If totCaseVar is NULL, use sum(cases), otherwise use max(cases)
    # If totDeathVar is NULL, use sum(deaths), otherwise use max(deaths)
    dfAgg <- df %>%
        group_by(state) %>%
        summarize(origTotalCases=if(is.null(totCaseVar)) sum(get(caseVar)) else max(get(totCaseVar)), 
                  origTotalDeaths=if(is.null(totDeathVar)) sum(get(deathVar)) else max(get(totDeathVar)), 
                  .groups="drop"
                  ) %>%
        mutate(totalCases=pmin(origTotalCases, maxCase), totalDeaths=pmin(origTotalDeaths, maxDeath)) %>%
        ungroup() %>%
        mutate(totalDeaths=ratioDeathvsCase*totalDeaths*sd(totalCases)/sd(totalDeaths)) %>%
        select(-origTotalCases, -origTotalDeaths)  # fields are just for QC while writing function

    # Create shape of the curve by state
    dfShape <- df %>%
        select_at(vars(all_of(c("timeBucket", "state", caseVar, deathVar)))) %>%
        purrr::set_names(c("timeBucket", "state", "cases", "deaths")) %>%
        group_by(state, timeBucket) %>%
        summarize_if(is.numeric, .funs=sum) %>%
        ungroup() %>%
        pivot_longer(-c(state, timeBucket)) %>%
        group_by(state, name) %>%
        mutate(tot=pmax(sum(value), ifelse(name=="deaths", minDeath, minCase)), 
               value=ifelse(name=="deaths", ratioDeathvsCase, 1) * value / tot) %>%
        select(-tot) %>%
        pivot_wider(state, names_from=c(name, timeBucket), values_from=value) %>%
        ungroup()
    
    # Function to calculate SD of a subset of columns
    calcSumSD <- function(df) {
        df %>% 
            ungroup() %>% 
            select(-state) %>% 
            summarize_all(.funs=sd) %>% 
            as.vector() %>% 
            sum()
    }
    
    # Down-weight the aggregate data so that there is the proper sum of sd in aggregates and shapes
    aggSD <- calcSumSD(dfAgg)
    shapeSD <- calcSumSD(dfShape)
    dfAgg <- dfAgg %>%
        mutate_if(is.numeric, ~. * ratioTotalvsShape * shapeSD / aggSD)
    
    # Combine so there is one row per state
    dfCluster <- dfAgg %>%
        inner_join(dfShape, by="state")
    
    # convert 'state' to rowname
    keyData <- dfCluster %>% 
        column_to_rownames("state")
    
    # Create rules-based segments (NA) or hierarchical segments (TRUE) or kmeans segments (FALSE)
    if (is.na(hierarchical)) {
        # Create pseudo-rules-based segments
        if (!is.null(seed)) set.seed(seed)
        # STEP 1: Classify high-medium-low based on deaths and cases
        hml <- kmeans(select(keyData, starts_with("total")), 
                      centers=hmlSegs, iter.max=iter.max, nstart=nstart
        )
        # STEP 2: Classify early-late based on shape
        esl <- kmeans(select(keyData, -starts_with("total")), 
                      centers=eslSegs, iter.max=iter.max, nstart=nstart
        )
        # STEP 3: Create a final segment
        objCluster <- eslSegs*(hml$cluster-1) + esl$cluster
    } else if (isTRUE(hierarchical)) {
        # Create hierarchical segments
        objCluster <-  hclust(dist(keyData), method=hierMethod)
        plot(objCluster)
    } else {
        # Create k-means segments
        # Create an elbow plot if testCenters is not NULL
        if (!is.null(testCenters)) {
            helperElbow(keyData, testCenters=testCenters, iter.max=iter.max, nstart=nstart, silhouette=TRUE)
        }
        # Create the kmeans cluster object, setting a seed if requested
        if (!is.null(seed)) set.seed(seed)
        objCluster <- kmeans(keyData, centers=nCenters, iter.max=iter.max, nstart=nstart)
        cat("\nCluster means and counts\n")
        n=objCluster$size %>% cbind(objCluster$centers) %>% round(2) %>% t() %>% print()
    }
    
    # Return the data and object is a list if returnList is TRUE, otherwise return only the clustering object
    if (!isTRUE(returnList)) {
        objCluster
    } else {
        list(objCluster=objCluster, dfCluster=dfCluster)
    }
    
}

The function can then be tested for various clustering schemes:

# Example for rules-based clustering
# hierarchical=NA selects the pseudo-rules-based path (kmeans on totals crossed
# with kmeans on shape); returnList=TRUE returns both the assignments and the data
clRules <- clusterStates(df=cdcPerCapita, 
                         hierarchical=NA, 
                         returnList=TRUE, 
                         shapeFunc=customTimeBucket, 
                         minShape="2020-04", 
                         maxShape="2021-03", 
                         hmlSegs=3, 
                         eslSegs=3, 
                         seed=2104221520
                         )
# Map the rules-based cluster assignment for each state
# (objCluster is a named vector here: names are states, values are segment numbers)
tibble::tibble(state=names(clRules$objCluster), cluster=factor(unname(clRules$objCluster))) %>%
    usmap::plot_usmap(regions="states", values="cluster", data=.) + 
    labs(title="Rules-based clusters using CDC daily data") + 
    scale_fill_discrete("Cluster")

# Example for kmeans clustering (elbow plot)
# testCenters=1:20 produces an elbow/silhouette assessment before the final fit;
# nstart=25 and iter.max=50 harden the kmeans solution vs. the defaults
# Return 7 segments
clKMeans <- clusterStates(df=cdcPerCapita, 
                          hierarchical=FALSE, 
                          returnList=TRUE, 
                          shapeFunc=customTimeBucket, 
                          minShape="2020-04", 
                          maxShape="2021-03", 
                          nCenters=7,
                          iter.max=50,
                          nstart=25,
                          testCenters=1:20,
                          seed=2104221530
                          )

## 
## Cluster means and counts
##                   1    2    3    4    5     6     7
## .              4.00 8.00 2.00 7.00 5.00 14.00 11.00
## totalCases     2.58 1.87 2.03 1.43 0.70  2.06  1.93
## totalDeaths    1.90 1.93 0.46 1.11 0.42  1.48  1.06
## cases_2020-04  0.02 0.07 0.01 0.04 0.02  0.02  0.01
## deaths_2020-04 0.04 0.19 0.02 0.11 0.09  0.04  0.03
## cases_2020-05  0.02 0.04 0.01 0.04 0.01  0.02  0.02
## deaths_2020-05 0.07 0.13 0.02 0.12 0.03  0.05  0.04
## cases_2020-06  0.03 0.02 0.02 0.02 0.02  0.03  0.02
## deaths_2020-06 0.03 0.05 0.02 0.06 0.02  0.04  0.03
## cases_2020-07  0.04 0.05 0.04 0.03 0.04  0.07  0.05
## deaths_2020-07 0.04 0.04 0.05 0.03 0.03  0.05  0.03
## cases_2020-08  0.04 0.04 0.03 0.03 0.07  0.06  0.04
## deaths_2020-08 0.04 0.04 0.05 0.03 0.05  0.06  0.04
## cases_2020-09  0.05 0.03 0.05 0.03 0.05  0.05  0.05
## deaths_2020-09 0.04 0.03 0.04 0.03 0.05  0.05  0.05
## cases_2020-10  0.13 0.05 0.12 0.06 0.05  0.07  0.11
## deaths_2020-10 0.09 0.03 0.08 0.03 0.06  0.06  0.07
## cases_2020-11  0.23 0.12 0.24 0.16 0.13  0.17  0.23
## deaths_2020-11 0.17 0.05 0.13 0.07 0.07  0.10  0.13
## cases_2020-12  0.19 0.21 0.22 0.21 0.20  0.21  0.21
## deaths_2020-12 0.21 0.12 0.23 0.18 0.19  0.18  0.21
## cases_2021-01  0.15 0.21 0.15 0.19 0.21  0.19  0.16
## deaths_2021-01 0.16 0.16 0.18 0.17 0.23  0.19  0.18
## cases_2021-02  0.05 0.09 0.06 0.08 0.10  0.07  0.06
## deaths_2021-02 0.08 0.10 0.11 0.10 0.12  0.12  0.11
## cases_2021-03  0.05 0.08 0.05 0.09 0.11  0.04  0.04
## deaths_2021-03 0.03 0.05 0.08 0.06 0.07  0.05  0.07
# Map the k-means cluster assignment for each state
# (the kmeans object stores assignments in its $cluster component, named by state)
tibble::tibble(state=names(clKMeans$objCluster$cluster), 
               cluster=factor(unname(clKMeans$objCluster$cluster))
               ) %>%
    usmap::plot_usmap(regions="states", values="cluster", data=.) + 
    labs(title="K-means clusters using CDC daily data") + 
    scale_fill_discrete("Cluster")

# Example for hierarchical clustering (clusters)
# hierarchical=TRUE returns the hclust object itself (and plots the dendrogram)
clHier <- clusterStates(cdcPerCapita, 
                        hierarchical=TRUE, 
                        returnList=FALSE, 
                        shapeFunc=customTimeBucket, 
                        minShape="2020-04", 
                        maxShape="2021-01"
                        )

# Cut the tree into 7 clusters once, then map the assignments by state
# (avoids calling cutree() twice with identical arguments)
clHierCut <- cutree(clHier, k=7)
tibble::tibble(state=names(clHierCut), 
               cluster=factor(unname(clHierCut))
               ) %>%
    usmap::plot_usmap(regions="states", values="cluster", data=.) + 
    labs(title="Hierarchical clusters using CDC daily data") + 
    scale_fill_discrete("Cluster")

There are some meaningful differences in segment membership depending on method, suggesting that there may have been some convergence of outcomes across states. Generally, the heavy/hard cluster centered around NY and the light/late cluster centered in the NE/NW tend to appear in each approach. The rules-based segments look reasonably similar to those generated previously.

Next steps are to generate descriptive plots for the segments. The code for assessClusters() is copied and adapted:

# # STEP 6: Create the cluster assessments
# plotData <- assessClusters(useClusters, 
#                            dfState=stateData, 
#                            dfBurden=dfPerCapita,
#                            thruLabel=thruLabel,
#                            plotsTogether=TRUE
#                            )


# Function to reorder and relabel factors
changeOrderLabel <- function(df, 
                             fctVar="cluster",
                             grpVars=c(),
                             burdenVar="dpm", 
                             wgtVar="pop",
                             exclfct="999"
                             ) {
    
    # FUNCTION ARGUMENTS
    # df: the data frame
    # fctVar: the factor variable (supplied as a string)
    # grpVars: the variable(s) that the data are currently at (e.g., "fipsCounty" for county-level in df)
    # burdenVar: the disease burden variable for sorting
    # wgtVar: the weight variable for sorting
    # exclfct: the factor level to be excluded from analysis
    
    # General approach
    # 1. Data are aggregated to c(fctVar, grpVars) as x=sum(burdenVar*wgtVar) and y=mean(wgtVar)
    # 2. Data are aggregated to fctVar as z=sum(x)/sum(y)
    # 3. Factors are reordered from high to low on z, with the excluded factor added back last (if it exists)
    
    # Check if exclfct exists in the factor variable
    # Check the observed values as well as the declared levels: if df[[fctVar]] is
    # still a character column, levels() is NULL and rows equal to exclfct would
    # otherwise be silently converted to NA by the factor() call below
    fctCol <- df[, fctVar, drop=TRUE]
    fctDummy <- exclfct %in% levels(fctCol) || exclfct %in% as.character(fctCol)
    
    # Create the summary of impact by segment (the excluded level is left out of the ranking)
    newLevels <- df %>%
        filter(get(fctVar) != exclfct) %>%
        group_by_at(vars(all_of(c(fctVar, grpVars)))) %>%
        summarize(x=sum(get(burdenVar)*get(wgtVar)), y=mean(get(wgtVar)), .groups="drop") %>%
        group_by_at(vars(all_of(fctVar))) %>%
        summarize(z=sum(x)/sum(y), .groups="drop") %>%
        arrange(-z) %>%
        pull(fctVar) %>%
        as.character()
    
    # Add back the dummy factor at the end (if it exists)
    if (fctDummy) newLevels <- c(newLevels, exclfct)
    
    # Reassign the levels in df (levels and labels are kept identical)
    df %>% 
        mutate(!!fctVar:=factor(get(fctVar), levels=newLevels, labels=newLevels))
    
}


# Helper function to make the overall cluster summary statistics
# Produces four plots: membership (map or counts), population, burden over time, total burden
helperClusterSummaryPlots <- function(dfState, 
                                      dfPlot,
                                      showMap, 
                                      clusterPlotsTogether,
                                      weightedMean=TRUE,
                                      mapLevel="states", 
                                      p3Vars=c("cases"="cpm7", "deaths"="dpm7"),
                                      p4Vars=c("cases"="cases", "deaths"="deaths"), 
                                      p4Fun=sum
                                      ) {
    
    # FUNCTION ARGUMENTS:
    # dfState: contains the state/county-level data
    # dfPlot: contains the joined data for plotting
    # showMap: boolean for whether to create a map (if FALSE, segment membership counts are shown instead)
    # clusterPlotsTogether: boolean, whether to put all four plots on the same page
    # weightedMean: boolean, whether to create weighted mean by segment (if FALSE, median by segment is taken)
    # mapLevel: the level to be used on the map
    # p3Vars: the variables to be included in plot 3 (character vector of length 2, plotName=variableName)
    # p4Vars: the variables to be included in plot 4 (character vector of length 2, plotName=variableName)
    # p4Fun: the function to be applied in plot 4 (sum for sum of daily, max for max of cumulative)
    
    # Reorder the cluster levels in dfState to match dfPlot
    # Convert factor order to match dfPlot (can be reordered by argument to the calling function)
    dfState <- dfState %>%
        mutate(cluster=factor(cluster, levels=levels(dfPlot$cluster)))
    
    # PLOT 1: the segments on a map, or segment membership counts
    if (showMap) {
        if (mapLevel=="counties") {
            # For county maps, 'state' holds a county FIPS code; left-pad it to the
            # 5-digit form that usmap expects
            dfMap <- dfState %>%
                mutate(fips=stringr::str_pad(state, width=5, side="left", pad="0")) %>%
                select(fips, cluster)
        } else {
            dfMap <- dfState
        }
        # Create the map
        p1 <- usmap::plot_usmap(regions=mapLevel, data=dfMap, values="cluster") + 
            scale_fill_discrete("cluster") + 
            theme(legend.position="right")
    } else {
        # fct_rev() so the first cluster level lands at the top after coord_flip()
        p1 <- dfState %>%
            count(cluster) %>%
            ggplot(aes(x=fct_rev(cluster), y=n)) + 
            geom_col(aes(fill=cluster)) +
            geom_text(aes(y=n/2, label=n)) +   # count label centered inside each bar
            coord_flip() + 
            labs(x="", y="# Counties", title="Membership by segment")
    }
    
    # PLOT 2: population totals by segment (converted to millions)
    # Show population totals by cluster
    p2 <- dfState %>%
        group_by(cluster) %>%
        summarize(pop=sum(pop)/1000000, .groups="drop") %>%
        ggplot(aes(x=fct_rev(cluster), y=pop)) + 
        geom_col(aes(fill=cluster)) + 
        geom_text(aes(y=pop/2, label=round(pop))) + 
        labs(y="Population (millions)", x="Cluster", title="Population by cluster (millions)") +
        coord_flip()
    
    # PLOT 3: rolling 7-day mean daily disease burden by cluster over time
    # Both the median and the population-weighted mean are computed for each
    # date-cluster-metric combination; weightedMean selects which one is plotted
    p3 <- dfPlot %>%        
        select_at(vars(all_of(c("date", "cluster", unname(p3Vars), "pop")))) %>%
        purrr::set_names(c("date", "cluster", names(p3Vars), "pop")) %>%
        pivot_longer((-c(date, cluster, pop))) %>%
        filter(!is.na(value)) %>%
        group_by(date, cluster, name) %>%
        summarize(mdnValue=median(value), wtdValue=sum(pop*value)/sum(pop), .groups="drop") %>%
        ggplot(aes(x=date, y=if(weightedMean) wtdValue else mdnValue)) +
        geom_line(aes(group=cluster, color=cluster)) +
        facet_wrap(~name, scales="free_y") +
        labs(x="",
             y="Rolling 7-day mean, per million",
             title="Rolling 7-day mean daily disease burden, per million",
             subtitle=paste0(if(weightedMean) "Weighted mean" else "Median", 
                             " per day for states assigned to cluster"
             )
        ) + 
        scale_x_date(date_breaks="1 months", date_labels="%b")
    
    # PLOT 4: total cases and total deaths by cluster (displayed in thousands)
    p4 <- dfPlot %>%
        select_at(vars(all_of(c("cluster", "date", unname(p4Vars))))) %>%
        purrr::set_names(c("cluster", "date", names(p4Vars))) %>%
        group_by(cluster, date) %>%
        summarize_all(.funs=sum) %>% # Get the total by cluster-date so that it can then be summed/maxed
        group_by(cluster) %>%
        summarize_at(vars(names(p4Vars)), .funs=p4Fun) %>%
        ungroup() %>%
        pivot_longer(-cluster) %>%
        ggplot(aes(x=fct_rev(cluster), y=value/1000)) + 
        geom_col(aes(fill=cluster)) + 
        geom_text(aes(y=value/2000, label=round(value/1000))) +
        coord_flip() + 
        facet_wrap(~varMapper[name], scales="free_x") +   # varMapper supplies descriptive facet labels
        labs(x="Cluster", y="Burden (000s)", title="Total burden by segment")
    
    # Place the plots together if clusterPlotsTogether is TRUE, otherwise print them separately
    if (isTRUE(clusterPlotsTogether)) {
        gridExtra::grid.arrange(p1, p2, p3, p4, nrow=2, ncol=2)
    } else {
        print(p1); print(p2); print(p3); print(p4)
    }
    
}


# Function to create side-by-side plots for a deaths and cases metric
# The data in df are collapsed with aggFunc so they are unique by the grouping variables
helperBarDeathsCases <- function(df, 
                                 numVars,
                                 title="",
                                 xVar="state",
                                 fillVar=NULL,
                                 aggFunc=sum, 
                                 mapper=varMapper
                                 ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame containing the data
    # numVars: the relevant numeric variables for plotting (one facet per variable)
    # title: plot title, default is nothing
    # xVar: the x-axis variable for plotting
    # fillVar: the variable that will color the bars in the final plot (NULL means use "lightblue" for all)
    # aggFunc: the aggregate function (applied to make the data unique by the grouping variables)
    # mapper: mapping file to convert x/y variables to descriptive axes (named vector "variable"="label")
    
    # OVERALL FUNCTION PROCESS:
    # 1.  Variables in numVars are aggregated by aggFunc to be unique by c(xVar, fillVar)
    # 2.  Data are pivoted longer
    # 3.  Bar charts are created, with coloring by fillVar if provided
    
    # Grouping variables for the aggregation: xVar, plus fillVar when one is supplied
    byVar <- if (is.null(fillVar)) xVar else c(xVar, fillVar)
    
    # Choose the bar geom up front: a solid light blue bar, or bars colored by fillVar
    barGeom <- if (is.null(fillVar)) {
        geom_col(fill="lightblue")
    } else {
        geom_col(aes_string(fill=fillVar))
    }
    
    # Aggregate, pivot longer, and build the faceted horizontal bar chart
    # (bars ordered by the minimum of value so both facets share a single ordering)
    p1 <- df %>%
        select_at(vars(all_of(c(byVar, numVars)))) %>%
        group_by_at(vars(all_of(byVar))) %>%
        summarize_all(aggFunc) %>%
        pivot_longer(-all_of(byVar)) %>%
        ggplot(aes(x=fct_reorder(get(xVar), value, .fun=min), y=value)) + 
        barGeom + 
        coord_flip() + 
        facet_wrap(~mapper[name], scales="free_x") + 
        labs(x="", y="", title=title)
    
    # Print the plot
    print(p1)
    
}


# Helper function to make total and per capita by state (calls its own helper function)
helperCallTotalPerCapita <- function(dfPlot, 
                                     thruLabel, 
                                     isCDC=FALSE
                                     ) {
    
    # FUNCTION ARGUMENTS:
    # dfPlot: the plotting data frame
    # thruLabel: the date that data are through
    # isCDC: boolean, are the data from CDC daily rather than CTP?
    
    # CDC daily data use the tot_* / t*pm running-total variables (aggregate with max);
    # CTP data use the daily variables (aggregate with sum)
    useCDC <- isTRUE(isCDC)
    plotTitle <- paste0("Coronavirus impact by state through ", thruLabel)
    aggFunc <- if (useCDC) max else sum
    
    # Plot total cases and total deaths by state, colored by cluster
    helperBarDeathsCases(dfPlot, 
                         numVars=if (useCDC) c("tot_cases", "tot_deaths") else c("cases", "deaths"), 
                         title=plotTitle, 
                         xVar=c("state"), 
                         fillVar=c("cluster"), 
                         aggFunc=aggFunc
                         )
    
    # Plot cases per million and deaths per million by state, colored by cluster
    helperBarDeathsCases(dfPlot, 
                         numVars=if (useCDC) c("tcpm", "tdpm") else c("cpm", "dpm"), 
                         title=plotTitle, 
                         xVar=c("state"), 
                         fillVar=c("cluster"), 
                         aggFunc=aggFunc
                         )
    
}


# Helper function to assess 30-day change vs. total
# Produces a scatter/label plot of recent burden (y) against total burden (x) by state
helperRecentvsTotal <- function(df, 
                                xVar, 
                                yVar,
                                title,
                                recencyDays=30, 
                                ablineSlope=0.5, 
                                mapper=varMapper, 
                                labelPlot=TRUE,
                                printPlot=TRUE, 
                                isCDC=FALSE
                                ) {
    
    # FUNCTION ARGUMENTS:
    # df: the tibble containing data by state by day
    # xVar: the x-variable
    # yVar: the y-variable
    # title: the plot title
    # recencyDays: number of days to consider as recent
    # ablineSlope: dashed line will be drawn with this slope and intercept 0
    # mapper: mapping file to convert x/y variables to descriptive axes (named vector "variable"="label")
    # labelPlot: boolean, whether to show the labels for each point
    # printPlot: boolean, whether to display the plot (if FALSE, plot object is returned)
    # isCDC: boolean, are the data from CDC daily rather than CTP?
    
    # Get the earliest date (inclusive) that counts as recent
    dateCutoff <- df %>% pull(date) %>% max() - recencyDays + 1
    cat("\nRecency is defined as", format(dateCutoff, "%Y-%m-%d"), "through current\n")
    
    # For CDC data, tcpm/tdpm are running totals rather than daily increments;
    # zero them except on each state's last date so the sum below yields the final total
    if(isTRUE(isCDC)) {
        df <- df %>%
            group_by(state) %>%
            mutate(tcpm=ifelse(date==max(date), tcpm, 0), tdpm=ifelse(date==max(date), tdpm, 0)) %>%
            ungroup()
    }
    
    # Create the plot
    # The new* metrics keep only burden on/after dateCutoff; everything numeric is then
    # summed to one row per state-cluster before plotting
    # (CDC daily data name the increments new_cases/new_deaths; CTP uses cases/deaths)
    p1 <- df %>%
        mutate(newCases=ifelse(date >= dateCutoff, if(isTRUE(isCDC)) new_cases else cases, 0), 
               newDeaths=ifelse(date >= dateCutoff, if (isTRUE(isCDC)) new_deaths else deaths, 0), 
               newcpm=ifelse(date >= dateCutoff, cpm, 0), 
               newdpm=ifelse(date >= dateCutoff, dpm, 0)
        ) %>%
        group_by(state, cluster) %>%
        summarize_if(is.numeric, .funs=sum) %>%
        ungroup() %>%
        ggplot(aes_string(x=xVar, y=yVar)) + 
        labs(x=mapper[xVar], 
             y=mapper[yVar], 
             title=title, 
             subtitle=paste0("Dashed line represents ", 
                             round(100*ablineSlope), 
                             "% of total is new in last ", 
                             recencyDays,
                             " days"
             )
        ) + 
        geom_abline(lty=2, slope=ablineSlope) + 
        lims(x=c(0, NA), y=c(0, NA)) + 
        theme(legend.position = "bottom")
    
    # Add the appropriate geom (state-name labels if labelPlot, otherwise a scatterplot)
    if (labelPlot) p1 <- p1 + geom_text(aes(color=cluster, label=state))
    else p1 <- p1 + geom_point(aes(color=cluster), alpha=0.5)
    
    # Print the plot if requested, otherwise return the plot object
    if (isTRUE(printPlot)) {
        print(p1)
    } else {
        p1
    }
    
}


# Helper function to make recent vs. total plots
helperCallRecentvsTotal <- function(dfPlot, 
                                    thruLabel, 
                                    labelPoints, 
                                    recentTotalTogether, 
                                    isCDC=FALSE
                                    ) {
    
    # FUNCTION ARGUMENTS:
    # dfPlot: the plotting data frame
    # thruLabel: the date that data are through
    # labelPoints: boolean, whether to label the individual points
    # recentTotalTogether: boolean, whether to put these plots together on one page
    # isCDC: boolean, are the data from CDC daily rather than CTP?
    
    # Title shared by both plots
    plotTitle <- paste0("Coronavirus burden through ", thruLabel)
    
    # Build one recent-vs-total plot for a given x/y variable pair
    makePlot <- function(xVar, yVar) {
        helperRecentvsTotal(dfPlot, 
                            xVar=xVar, 
                            yVar=yVar, 
                            title=plotTitle, 
                            labelPlot=labelPoints, 
                            printPlot=FALSE, 
                            isCDC=isCDC
                            )
    }
    
    # Last-30 vs. total for cases per million by state, colored by cluster
    p7 <- makePlot(if (isTRUE(isCDC)) "tcpm" else "cpm", "newcpm")
    
    # Last-30 vs. total for deaths per million by state, colored by cluster
    p8 <- makePlot(if (isTRUE(isCDC)) "tdpm" else "dpm", "newdpm")
    
    # Print the plots either as a single page or separately
    if (isTRUE(recentTotalTogether)) {
        gridExtra::grid.arrange(p7, p8, nrow=1)
    } else {
        print(p7)
        print(p8)
    }
    
}


# Function to plot cluster vs. individual elements on a key metric
# One facet per cluster: a bold aggregate line plus either all member lines or a quantile ribbon
helperTotalvsElements <- function(df, 
                                  keyVar, 
                                  title,
                                  aggAndTotal=TRUE,
                                  pctRibbon=0.8,
                                  aggFunc=if(aggAndTotal) median else mean, 
                                  mapper=varMapper, 
                                  facetScales="free_y", 
                                  printPlot=TRUE
                                  ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame containing n-day rolling averages
    # keyVar: the variable to be plotted
    # title: the plot title
    # aggAndTotal: boolean, whether to plot every individual observation along with the cluster aggregate
    # pctRibbon: if aggAndTotal is FALSE, a ribbon covering this percentage of the data will be plotted
    # aggFunc: how to aggregate the elements to the segment
    #          CAUTION that this is an aggregate of averages, rather than a population-weighted aggregate
    # mapper: the variable mapping file to get the appropriate label for keyVar
    # facetScales: the scaling for the facets - "free_y" to let them all float, "fixed" to have them the same
    # printPlot: boolean, if TRUE print the plot (otherwise return the plot object)
    
    # Create an appropriate subtitle (the final branch flags an unrecognized scale value)
    subtitle <- if(facetScales=="free_y") {
        "Caution that each facet has its own y axis with different scales"
    } else if (facetScales=="fixed") { 
        "All facets are on the same scale"
    } else {
        "Update subtitle code in function helperTotalvsElements"
    }
    
    # Create an appropriate caption (the ribbon note only applies when aggAndTotal is FALSE)
    xCap <- "1. Cluster aggregate is mean over all observations (NOT population-weighted)"
    xCap <- paste0(xCap, "\n2. Ribbons reflect range covering ", round(pctRibbon*100), "% of observations")
    caption <- if(aggAndTotal) {
        "Cluster aggregate is median, weighting each observation equally\n(NOT population-weighted)"
    } else {
        xCap
    }
    
    # Create the plots for segment-level data
    # The rbind/mutate duplicates every row under the pseudo-state "cluster" so the
    # grouped summarize yields both per-state values and a cluster-level aggregate;
    # only the pseudo-state rows are drawn here (one bold line per cluster facet)
    p1 <- df %>%
        rbind(mutate(., state="cluster")) %>%
        group_by(state, cluster, date) %>%
        summarize_at(vars(all_of(keyVar)), .funs=aggFunc) %>%
        ungroup() %>%
        filter(!is.na(get(keyVar))) %>%
        ggplot(aes_string(x="date")) + 
        geom_line(data=~filter(., state == "cluster"), 
                  aes(y=get(keyVar), group=state, color=cluster), 
                  lwd=1.5
        ) + 
        facet_wrap(~cluster, scales=facetScales) + 
        labs(x="", 
             y=mapper[keyVar], 
             title=title, 
             subtitle=subtitle,
             caption=caption
        ) + 
        ylim(c(0, NA)) + 
        theme(legend.position="bottom")
    
    # Add an appropriate individual metric (either every observation or quantiles)
    if (aggAndTotal) {
        # One thin line per actual state (the non-pseudo rows)
        p1 <- p1 + 
            geom_line(data=~filter(., state != "cluster"), 
                      aes(y=get(keyVar), group=state), 
                      alpha=0.25
            )
    } else {
        # Ribbon spanning the central pctRibbon share of observations per cluster-date
        dfRibbon <- df %>%
            filter(!is.na(get(keyVar))) %>%
            group_by(cluster, date) %>%
            summarize(ylow=quantile(get(keyVar), 0.5-0.5*pctRibbon), 
                      yhigh=quantile(get(keyVar), 0.5+0.5*pctRibbon), 
                      .groups="drop"
            )
        p1 <- p1 + 
            geom_ribbon(data=dfRibbon, 
                        aes(ymin=ylow, ymax=yhigh), 
                        alpha=0.25
            )
    }
    
    # Print plot if requested, otherwise return it
    if (isTRUE(printPlot)) {
        print(p1)
    } else {
        p1
    }
    
}


# Helper function to create total vs. elements plots
helperCallTotalvsElements <- function(dfPlot, 
                                      aggAndTotal, 
                                      clusterAggTogether,
                                      ...
                                      ) {
    
    # FUNCTION ARGUMENTS:
    # dfPlot: the plotting data frame
    # aggAndTotal: boolean, should each individual line be plotted (if FALSE an 80% CI is plotted instead)
    # clusterAggTogether: boolean, whether to print the plots all on a single page
    # ...: any other arguments to pass to helperTotalvsElements (especially pctRibbon or aggFunc)
    
    # Titles shared by the free-scale and fixed-scale versions of each plot
    casesTitle <- "Cases per million, 7-day rolling mean"
    deathsTitle <- "Deaths per million, 7-day rolling mean"
    
    # Cases per million: default (free) y-scale, then fixed y-scale
    p9 <- helperTotalvsElements(dfPlot, keyVar="cpm7", aggAndTotal=aggAndTotal, 
                                title=casesTitle, printPlot=FALSE, ...
                                )
    p10 <- helperTotalvsElements(dfPlot, keyVar="cpm7", aggAndTotal=aggAndTotal, 
                                 title=casesTitle, facetScales="fixed", printPlot=FALSE, ...
                                 )
    
    # Deaths per million: default (free) y-scale, then fixed y-scale
    p11 <- helperTotalvsElements(dfPlot, keyVar="dpm7", aggAndTotal=aggAndTotal, 
                                 title=deathsTitle, printPlot=FALSE, ...
                                 )
    p12 <- helperTotalvsElements(dfPlot, keyVar="dpm7", aggAndTotal=aggAndTotal, 
                                 title=deathsTitle, facetScales="fixed", printPlot=FALSE, ...
                                 )
    
    # Arrange cases/deaths side by side (free-scale page, then fixed-scale page), or print singly
    if (isTRUE(clusterAggTogether)) {
        gridExtra::grid.arrange(p9, p11, nrow=1)
        gridExtra::grid.arrange(p10, p12, nrow=1)
    } else {
        for (p in list(p9, p10, p11, p12)) print(p)
    }
    
}


# assessClusters() function
assessClusters <- function(clusters, 
                           dfState=stateData, 
                           dfBurden=cvFilteredPerCapita,
                           thruLabel="Aug 20, 2020",
                           isCounty=FALSE,
                           plotsTogether=FALSE, 
                           clusterPlotsTogether=plotsTogether,
                           recentTotalTogether=plotsTogether, 
                           clusterAggTogether=plotsTogether, 
                           makeSummaryPlots=TRUE, 
                           makeTotalvsPerCapitaPlots=!isCounty, 
                           makeRecentvsTotalPlots=TRUE, 
                           makeTotalvsElementPlots=TRUE, 
                           showMap=!isCounty, 
                           orderCluster=FALSE, 
                           isCDC=FALSE,
                           p3Vars=c("cases"="cpm7", "deaths"="dpm7"),
                           p4Vars=c("cases"="cases", "deaths"="deaths"), 
                           p4Fun=sum
                           ) {
    
    # FUNCTION ARGUMENTS:
    # clusters: the named vector containing the clusters by state
    # dfState: the file containing the states and populations
    # dfBurden: the data containing the relevant per capita burden statistics by state-date
    # thruLabel: label for plots for 'data through'
    # isCounty: boolean, is this a plot of county-level data that have been named 'state'?
    #           FALSE means that it is normal state-level data
    # plotsTogether: boolean, should plots be consolidated on fewer pages?
    # clusterPlotsTogether: boolean, should plots p1-p4 be consolidated?
    # recentTotalTogether: boolean, should recent total plots p7-p8 be consolidated?
    # clusterAggTogether: boolean, should aggregate plots p9/p11 and p10/p12 be consolidated?
    # makeSummaryPlots: boolean, should the summary plots be made?
    # makeTotalvsPerCapitaPlots: boolean, should the total and per capita plots be produced?
    # makeRecentvsTotalPlots: boolean, should the recent vs. total plots be created?
    # makeTotalvsElementPlots: boolean, should the total vs. element plots be created?
    # showMap: boolean, whether to create a map colored by cluster (will show segment counts otherwise)
    # orderCluster: if FALSE, ignore; if TRUE, order by "dpm"; if anything else, order by orderCluster
    # isCDC: boolean, are the data from CDC daily rather than CTP?
    # p3Vars: the variables to be included in plot 3 (character vector of length 2, plotName=variableName)
    # p4Vars: the variables to be included in plot 4 (character vector of length 2, plotName=variableName)
    # p4Fun: the function to be applied in plot 4 (sum for sum of daily, max for max of cumulative)
    #
    # RETURNS: dfPlot, the state-cluster-date tibble used for plotting
    
    # Attach the clusters to the state population data
    dfState <- as.data.frame(clusters) %>%
        set_names("cluster") %>%
        rownames_to_column("state") %>%
        inner_join(dfState, by="state") %>%
        mutate(cluster=factor(cluster))
    
    # Plot the rolling 7-day mean daily disease burden by cluster
    dfPlot <- dfState %>%
        inner_join(dfBurden, by="state") %>%
        tibble::as_tibble()
    
    # Reorder the clusters if requested
    if (!isFALSE(orderCluster)) {
        if (isTRUE(orderCluster)) burdenParam <- "dpm" else burdenParam <- orderCluster
        dfPlot <- changeOrderLabel(dfPlot, grpVars="state", burdenVar=burdenParam)
    }
    
    # Call the helper function to make the overall summary statistics
    if (makeSummaryPlots) {
        helperClusterSummaryPlots(dfState=dfState, 
                                  dfPlot=dfPlot, 
                                  showMap=showMap, 
                                  clusterPlotsTogether=clusterPlotsTogether, 
                                  mapLevel=if(isCounty) "counties" else "states", 
                                  p3Vars=p3Vars, 
                                  p4Vars=p4Vars, 
                                  p4Fun=p4Fun
                                  )
    }
    
    # These are primarily useful for state-level data and default to FALSE when isCounty is TRUE
    if (makeTotalvsPerCapitaPlots) {
        helperCallTotalPerCapita(dfPlot=dfPlot, thruLabel=thruLabel, isCDC=isCDC)
    }
    
    # Call the helper function to make recent vs. total plots
    if (makeRecentvsTotalPlots) {
        helperCallRecentvsTotal(dfPlot=dfPlot, 
                                thruLabel=thruLabel, 
                                labelPoints=!isCounty, 
                                recentTotalTogether=recentTotalTogether, 
                                isCDC=isCDC
                                )
    }
    
    # Call the total vs. elements helper function
    # BUG FIX: previously passed clusterPlotsTogether here, which silently
    # ignored the clusterAggTogether argument documented above
    if (makeTotalvsElementPlots) {
        helperCallTotalvsElements(dfPlot=dfPlot, 
                                  aggAndTotal=!isCounty, 
                                  clusterAggTogether=clusterAggTogether
        )
    }
    
    # Return the plotting data frame
    dfPlot
    
}

The function is then tested against the CDC daily data:

# Run the full cluster assessment on the CDC daily data:
# - p4Vars uses the cumulative tot_* fields, so p4Fun=max (max of a cumulative
#   series) replaces the default sum-of-daily aggregation
# - orderCluster="dpm" orders clusters by deaths per million
cdcPlotData <- assessClusters(clusters=clRules$objCluster,
                              dfState=stateDataCDC,
                              dfBurden=cdcPerCapita,
                              thruLabel="Apr 12, 2021",
                              plotsTogether=TRUE, 
                              orderCluster="dpm", 
                              p4Vars=c("cases"="tot_cases", "deaths"="tot_deaths"), 
                              p4Fun=max, 
                              isCDC=TRUE
                              )

## 
## Recency is defined as 2021-03-14 through current
## 
## Recency is defined as 2021-03-14 through current

cdcPlotData
## # A tibble: 22,797 x 17
##    state cluster name     pop date       new_cases new_deaths tot_cases
##    <chr> <fct>   <chr>  <dbl> <date>         <dbl>      <dbl>     <dbl>
##  1 AK    4       Alas~ 731545 2020-01-22         0          0         0
##  2 AK    4       Alas~ 731545 2020-01-23         0          0         0
##  3 AK    4       Alas~ 731545 2020-01-24         0          0         0
##  4 AK    4       Alas~ 731545 2020-01-25         0          0         0
##  5 AK    4       Alas~ 731545 2020-01-26         0          0         0
##  6 AK    4       Alas~ 731545 2020-01-27         0          0         0
##  7 AK    4       Alas~ 731545 2020-01-28         0          0         0
##  8 AK    4       Alas~ 731545 2020-01-29         0          0         0
##  9 AK    4       Alas~ 731545 2020-01-30         0          0         0
## 10 AK    4       Alas~ 731545 2020-01-31         0          0         0
## # ... with 22,787 more rows, and 9 more variables: tot_deaths <dbl>, cpm <dbl>,
## #   dpm <dbl>, tcpm <dbl>, tdpm <dbl>, cpm7 <dbl>, dpm7 <dbl>, tcpm7 <dbl>,
## #   tdpm7 <dbl>

Next steps are to adapt the code for plotConsolidatedMetrics() for use with CDC daily data:

# STEP 7: Plot the consolidated metrics
# subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new), Tests: new tests"
# consolidatedPlotData <- plotConsolidatedMetrics(plotData, 
#                                                 varMain=c("state", "cluster", "date", "pop",
#                                                           "cases", "deaths", "hosp", "tests"
#                                                           ), 
#                                                 subT=subT, 
#                                                 nrowPlot2=2
#                                                 )

# Function to create plots of consolidated metrics
plotConsolidatedMetrics <- function(dfMain, 
                                    dfHosp=NULL, 
                                    varMain=c("state", "cluster", "date", "pop", "cases", "deaths", "hosp"),
                                    varDropHosp=c("n", "pop"), 
                                    joinBy=c("state", "cluster", "date"), 
                                    subT=NULL, 
                                    nrowPlot2=1
                                    ) {
    
    # FUNCTION ARGUMENTS:
    # dfMain: the main file produced by assessClusters(), containing data by state-cluster-date
    # dfHosp: the separate file with hospital data (NULL means data already available in dfMain)
    # varMain: variables to keep from dfMain
    # varDropHosp: variables to drop from dfHosp
    # joinBy: variables for joining dfMain and dfHosp
    # subT: plot subtitle (NULL will use the defaults), 
    # nrowPlot2: number of rows for the facetting to use on plot 2
    #
    # RETURNS: the long plotting data (still grouped by state-cluster-name) with
    #          vpm (value per million) and vpm7 (rolling 7-day mean) columns
    
    # Default subtitle when none is supplied
    if (is.null(subT)) {
        subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new)"
    }
    
    # Filter dfMain to include only variables in varMain
    dfMain <- dfMain %>%
        select_at(vars(all_of(varMain)))
    
    # Left join dfMain to dfHosp unless dfHosp is NULL
    # (dfHosp first drops the varDropHosp columns to avoid duplicate names)
    if (!is.null(dfHosp)) {
        dfHosp <- dfHosp %>%
            select_at(vars(all_of(names(dfHosp)[!(names(dfHosp) %in% varDropHosp)])))
        dfMain <- dfMain %>%
            left_join(dfHosp, by=all_of(joinBy))
    }
    
    # Check that variables state, cluster, date, pop are all available
    # (%in% and %>% share precedence and bind left-to-right, so this is
    #  !all(c(...) %in% names(dfMain)))
    if (!(c("state", "cluster", "date", "pop") %in% names(dfMain) %>% all())) {
        stop("\nFunction requires variables state, cluster, date, and pop after processing\n")
    }
    
    # Create the relevant plotting data:
    # - pivot all metrics long, drop missing values
    # - rbind() with the magrittr dot duplicates every row under the pseudo-state
    #   "cluster" so cluster-level aggregates appear alongside the states
    # - sum value and pop within joinBy-name groups; for the "cluster" rows this
    #   sums pop across member states (NOTE(review): a state missing one metric
    #   is dropped from that metric's pop denominator - confirm acceptable)
    # - vpm is the per-million rate; helperRollingAgg adds vpm7 (7-day mean)
    #   and receives the data still grouped by state-cluster-name
    dfPlot <- dfMain %>%
        pivot_longer(-c(state, cluster, date, pop)) %>%
        filter(!is.na(value)) %>%
        rbind(mutate(., state="cluster")) %>%
        group_by_at(vars(all_of(c(joinBy, "name")))) %>%
        summarize(value=sum(value), pop=sum(pop), .groups="drop") %>%
        mutate(vpm=1000000*value/pop) %>%
        arrange(state, cluster, name, date) %>%
        group_by(state, cluster, name) %>%
        helperRollingAgg(origVar="vpm", newName="vpm7")    
    
    # Create facetted plots for totals by metric by segment
    # (bold colored line = cluster aggregate; faint lines = member states)
    p1 <- dfPlot %>%
        filter(!is.na(vpm7)) %>%
        ggplot(aes(x=date, y=vpm7)) + 
        geom_line(data=~filter(., state=="cluster"), aes(group=cluster, color=cluster), lwd=1.5) +
        geom_line(data=~filter(., state!="cluster"), aes(group=state), alpha=0.25) + 
        facet_grid(name ~ cluster, scales="free_y") + 
        labs(x="", 
             y="Rolling 7-day mean per million", 
             title="Key metrics by cluster (7-day rolling mean per million)", 
             subtitle=subT
        ) + 
        scale_x_date(date_breaks="1 months", date_labels="%b") + 
        theme(axis.text.x=element_text(angle=90))
    print(p1)
    
    # Create all-segment plot by metric (cluster aggregates only, one facet per metric)
    p2 <- dfPlot %>%
        filter(!is.na(vpm7)) %>%
        ggplot(aes(x=date, y=vpm7)) + 
        geom_line(data=~filter(., state=="cluster"), aes(group=cluster, color=cluster), lwd=1.5) +
        facet_wrap(~ name, scales="free_y", nrow=nrowPlot2) + 
        labs(x="", 
             y="Rolling 7-day mean per million", 
             title="Key metrics by cluster (7-day rolling mean per million)", 
             subtitle=subT
        ) + 
        scale_x_date(date_breaks="1 months", date_labels="%b") + 
        theme(axis.text.x=element_text(angle=90))
    print(p2)
    
    # Create all-metric plot by segment (define 100% as peak for segment-metric)
    # spm7 rescales each state-cluster-name series to its own maximum
    p3 <- dfPlot %>%
        filter(!is.na(vpm7)) %>%
        group_by(state, cluster, name) %>%
        mutate(spm7=vpm7/max(vpm7)) %>%
        ggplot(aes(x=date, y=spm7)) + 
        geom_line(data=~filter(., state=="cluster"), aes(group=name, color=name), lwd=1) +
        facet_wrap(~ cluster, scales="free_y") + 
        labs(x="", 
             y="% of Maximum", 
             title="Key metrics by cluster (% of cluster maximum for variable)", 
             subtitle=subT
        ) + 
        scale_x_date(date_breaks="1 months", date_labels="%b") + 
        scale_color_discrete("variable") +
        theme(axis.text.x=element_text(angle=90))
    print(p3)
    
    # Return the plotting data
    dfPlot
    
}

The function appears OK as-is provided that appropriate parameters are passed:

# Subtitle spelling out what each CDC metric name means
subT <- paste0("new_cases: new cases, new_deaths: new deaths",
               ", tot_cases: cumulative cases, tot_deaths: cumulative deaths")

# Consolidated metric plots for the CDC daily data (no hospitalization file)
consolidatedCDCPlotData <- plotConsolidatedMetrics(
    cdcPlotData,
    subT=subT,
    nrowPlot2=2,
    varMain=c("state", "cluster", "date", "pop",
              "new_cases", "new_deaths",
              "tot_cases", "tot_deaths")
)

Next steps are to adapt the code for making cumulative plots:

# # STEP 8: Create cumulative metrics if requested
# consPos <- consolidatedPlotData %>%
#     ungroup() %>%
#     select(state, cluster, date, name, vpm7) %>%
#     arrange(state, cluster, date, name) %>%
#     pivot_wider(-vpm7, names_from="name", values_from="vpm7") %>%
#     mutate(pctpos=cases/tests) %>%
#     pivot_longer(-c(state, cluster, date), values_to="vpm7") %>%
#     filter(!is.na(vpm7))
# 
# clCum <- makeCumulative(consPos)
# plotCumulativeData(clCum, 
#                    keyMetricp2="", 
#                    flagsp2="", 
#                    makep1=TRUE, 
#                    makep2=FALSE
#                    )
# plotCumulativeData(clCum, 
#                    keyMetricp2="deaths", 
#                    flagsp2=findFlagStates(clCum, keyMetricVal = "deaths")
#                    )
# plotCumulativeData(clCum, 
#                    keyMetricp2="cases", 
#                    flagsp2=findFlagStates(clCum, keyMetricVal = "cases")
#                    )
# plotCumulativeData(clCum, 
#                    keyMetricp2="tests", 
#                    flagsp2=findFlagStates(clCum, keyMetricVal = "tests")
#                    )


# Function to convert a file to cumulative totals
# Function is already OK for CDC daily data (cum7 column will be meaningless in some cases, but code runs)
makeCumulative <- function(df, 
                           typeVar="name", 
                           typeKeep=c("cases", "deaths", "tests"), 
                           valVar="vpm7", 
                           groupVars=c("state", "cluster", "name"), 
                           arrangeVars=c("date"), 
                           newName="cum7"
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing the metrics
    # typeVar: the variable holding the metric type (default is 'name')
    # typeKeep: the values of typeVar to be kept
    # valVar: the variable holding the metric value (default is 'vpm7')
    # groupVars: groups for calculating cumulative sum
    # arrangeVars: variables to be sorted by before calculating cumulative sum
    # newName: the name for the new variable
    #
    # RETURNS: the filtered data with the cumulative-sum column added (ungrouped)
    
    # Create the cumulative data
    # Use the .data pronoun instead of get() so column lookup always happens in
    # the data mask (get() can silently fall back to the calling environment),
    # and across() in place of the superseded arrange_at()/group_by_at()
    dfCum <- df %>%
        filter(.data[[typeVar]] %in% typeKeep, !is.na(.data[[valVar]])) %>%
        arrange(across(all_of(c(groupVars, arrangeVars)))) %>%
        group_by(across(all_of(groupVars))) %>%
        mutate(!!newName := cumsum(.data[[valVar]])) %>%
        ungroup()
    
    # Return the processed data
    dfCum
    
}


# Function to find and flag states that are high on a key value or change in key value
findFlagStates <- function(df, 
                           keyMetricVal, 
                           keyMetricVar="name", 
                           cumVar="cum7", 
                           prevDays=30, 
                           nFlag=5
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame containing the cumulative data
    # keyMetricVal: the metric of interest (e.g., "deaths", "cases", "tests")
    # keyMetricVar: the variable name for the column containing the metric of interest
    # cumVar: variable containing the cumulative totals
    # prevDays: the number of days previous to use for defining growth
    # nFlag: the number of states to flag for either total or growth (top-n of each)
    #
    # RETURNS: a character vector of the flagged state abbreviations
    
    # Find the top-nFlag states in either total or recent increase
    # (summarize(.groups="drop") already ungroups, so no separate ungroup() is
    # needed; .data pronoun replaces get() for safe in-data column lookup)
    # NOTE(review): if the date exactly prevDays before max(date) is missing for
    # a state (data gap), tminus is 0 and growth equals maxVal, overstating
    # growth - confirm the data are daily-complete before trusting rkGrowth
    dfFlag <- df %>%
        filter(.data[[keyMetricVar]]==keyMetricVal, state!="cluster") %>%
        select(all_of(c("state", "date", cumVar))) %>%
        group_by(state) %>%
        summarize(maxVal=max(.data[[cumVar]]), 
                  tminus=sum(ifelse(date==max(date)-lubridate::days(prevDays), .data[[cumVar]], 0)), 
                  .groups="drop"
        ) %>%
        mutate(growth=maxVal-tminus, 
               rkTotal=min_rank(-maxVal), 
               rkGrowth=min_rank(-growth), 
               flag=ifelse(pmin(rkTotal, rkGrowth)<=nFlag, 1, 0)
        ) %>%
        arrange(-flag, rkTotal)
    
    # Return the top values as a vector of states
    dfFlag %>%
        filter(flag==1) %>%
        pull(state)
    
}


# Function to plot cumulative data
# Will need to call using vpm7 or cum7 depending on metric
plotCumulativeData <- function(df, 
                               keyMetricp2,
                               flagsp2,
                               p2Desc=keyMetricp2,
                               keyVar="cum7", 
                               makep1=FALSE, 
                               makep2=TRUE, 
                               otherKeyVar="vpm7", 
                               namesOtherKeyVar=""
                               ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame of cumulative data
    # keyMetricp2: the key metric to be plotted in the second plot (e.g., "deaths", "cases", "tests")
    # flagsp2: states to be treated as flagged in the second plot
    # p2Desc: the description to be used in plot 2
    # keyVar: the key variable to plot
    # makep1: boolean, whether to make the first plot
    # makep2: boolean, whether to make the second plot
    # otherKeyVar: other key variable to be used (allows for vpm7 for some variables)
    # namesOtherKeyVar: values for 'name' that should use otherKeyVar
    #
    # RETURNS: nothing useful (prints the requested plots as a side effect)
    
    # Plot the cumulative data by cluster (if flag is set for producing this)
    # Metrics listed in namesOtherKeyVar are plotted on otherKeyVar (e.g. vpm7)
    # instead of keyVar, since a cumulative sum of an already-cumulative
    # metric would be meaningless
    if (isTRUE(makep1)) {
        p1 <- df %>%
            filter(state=="cluster") %>%
            mutate(plotVar=ifelse(name %in% namesOtherKeyVar, get(otherKeyVar), get(keyVar))) %>%
            ggplot(aes(x=date, y=plotVar)) + 
            geom_line(aes(group=cluster, color=cluster)) + 
            facet_wrap(~name, nrow=1, scales="free_y") + 
            scale_x_date(date_breaks="1 months", date_labels="%m") + 
            labs(x="Month", 
                 y="Cumulative Burden (per million)", 
                 title="Cumulative burden by segment (per million)"
            )
        print(p1)
    }
    
    
    # Plot the cumulative totals over time for one metric, and flag the highest
    # bold is 0/1; alpha and size are computed from it and passed through the
    # identity scales so flagged states are drawn heavier and labelled at the
    # final date (label nudged 10 days right of the series end)
    if (isTRUE(makep2)) {
        p2 <- df %>%
            filter(state!="cluster", name==keyMetricp2) %>%
            mutate(bold=ifelse(state %in% flagsp2, 1, 0)) %>%
            ggplot(aes(x=date, y=get(keyVar))) + 
            geom_line(aes(group=state, color=cluster, alpha=0.4+0.6*bold, size=0.5+0.5*bold)) + 
            geom_text(data=~filter(., bold==1, date==max(date)), 
                      aes(x=date+lubridate::days(10), 
                          label=paste0(state, ": ", round(get(keyVar), 0)), 
                          color=cluster
                      ), 
                      size=3, 
                      fontface="bold"
            ) +
            scale_x_date(date_breaks="1 months", date_labels="%m") + 
            scale_alpha_identity() +
            scale_size_identity() +
            labs(x="Month", 
                 y=paste0("Cumulative ", p2Desc, " (per million)"), 
                 title=paste0("Cumulative coronavirus ", p2Desc, " by state (per million)"), 
                 subtitle="Top 5 states for total and growth rate are bolded and labelled"
            )
        print(p2)
    }
    
}

The code and function are then run:

# Without tests data, this is not particularly useful (removes the NA from early/late vpm7)
# The wide-then-long round trip aligns all metrics on common state-cluster-date
# rows; the second positional argument to pivot_wider() is id_cols, so -vpm7
# uses every remaining column as an identifier
consCDCPos <- consolidatedCDCPlotData %>%
    ungroup() %>%
    select(state, cluster, date, name, vpm7) %>%
    arrange(state, cluster, date, name) %>%
    pivot_wider(-vpm7, names_from="name", values_from="vpm7") %>%
    # mutate(pctpos=cases/tests) %>%  # deleted since there is no 'tests' field
    pivot_longer(-c(state, cluster, date), values_to="vpm7") %>%
    filter(!is.na(vpm7))

# Build a cumulative file keeping both new and total metrics (a running sum of
# the already-cumulative tot_* columns is meaningless, but the code runs)
clCDCCum <- makeCumulative(consCDCPos,
                           typeKeep=c("new_cases", "new_deaths",
                                      "tot_cases", "tot_deaths"))

# Cluster-level cumulative plot of every variable (plot 1 only); the tot_*
# metrics are shown on their per-million values via namesOtherKeyVar
plotCumulativeData(clCDCCum, keyMetricp2="", flagsp2="",
                   makep1=TRUE, makep2=FALSE,
                   namesOtherKeyVar=c("tot_cases", "tot_deaths"))

# State-level cumulative plot for new_deaths, flagging the top states
plotCumulativeData(clCDCCum, keyMetricp2="new_deaths",
                   flagsp2=findFlagStates(clCDCCum, keyMetricVal="new_deaths"))

# State-level cumulative plot for new_cases, flagging the top states
plotCumulativeData(clCDCCum, keyMetricp2="new_cases",
                   flagsp2=findFlagStates(clCDCCum, keyMetricVal="new_cases"))

Next steps are to write the full function using the updates made above.

The elements of the list are saved, for use in future functions:

# Bundle the inputs, intermediates, and outputs of this run for later reuse
cdcDaily_hier7_210414 <- list(stateData=stateDataCDC, 
                              dfRaw=cdcRaw, 
                              dfFiltered=cdcFiltered, 
                              dfPerCapita=cdcPerCapita, 
                              useClusters=clRules, 
                              plotData=cdcPlotData, 
                              consolidatedPlotData=consolidatedCDCPlotData, 
                              clCum=clCDCCum
                              )
# Persist to RDS; ovrWriteError=FALSE means an existing file is left in place
saveToRDS(cdcDaily_hier7_210414, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cdcDaily_hier7_210414.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

The full function is copied and updated for use with CDC daily data:

# Function to read, convert, and sanity check a downloaded file
readCOViDbyState <- function(fileName, 
                             checkFile=NULL, 
                             controlFields=c("positiveIncrease", "deathIncrease", "hospitalizedCurrently"), 
                             controlBy=c("state"), 
                             dateChangePlot=FALSE, 
                             dateMetricPrint=TRUE, 
                             controlByMetricPrint=TRUE, 
                             writeLog=NULL, 
                             ovrwriteLog=TRUE, 
                             isCDCDaily=FALSE
                             ) {
    
    # FUNCTION ARGUMENTS:
    # fileName: the file name for reading the data
    # checkFile: a file that can be used for comparison purposes (NULL means do not compare to anything)
    # controlFields: fields that will be explicitly checked against checkFile
    # controlBy: level of aggregation at which fields will be explicitly checked against checkFile
    # dateChangePlot: boolean, should the change in dates included be plotted rather than listed?
    # dateMetricPrint: boolean, should the list of date-metric changes be printed?
    # controlByMetricPrint: boolean, should the list of controlBy-metric changes be printed?
    # writeLog: write detailed comparison to log file (NULL means do not write)
    # ovrwriteLog: boolean, should the log be started from scratch with the date comparisons?
    # isCDCDaily: boolean, are the data from CDC (default FALSE means use CTP)
    #
    # RETURNS: the file read from fileName, with 'date' converted to a Date column
    
    # Helper function to check for similarity of key elements: prints set
    # differences between refData and newData (counts only when countOnly=TRUE,
    # optionally logging details), and returns the differences when countOnly
    helperSimilarity <- function(newData, refData, label, countOnly=FALSE, logFile=NULL, logAppend=TRUE) {
        d1 <- setdiff(refData, newData)
        d2 <- setdiff(newData, refData)
        cat("\n\nChecking for similarity of:", label)
        cat("\nIn reference but not in current:", if(countOnly) length(d1) else d1)
        cat("\nIn current but not in reference:", if(countOnly) length(d2) else d2)
        # FIX: use the scalar && (not vectorized &) in a scalar if() condition
        if (countOnly && !is.null(logFile)) {
            cat("\nDetailed differences available in:", logFile)
            capture.output(cat("\nIn reference but not in current:\n", paste(d1, collapse="\n"), sep=""), 
                           cat("\nIn current but not in reference:\n", paste(d2, collapse="\n"), sep=""), 
                           file=logFile, 
                           append=logAppend
                           )
        }
        if (countOnly) return(list(d1=d1, d2=d2))
    }
    
    # Read in the file and convert the date field for proper usage in remainder of process
    # (CDC daily uses mdy-formatted 'submission_date'; CTP uses ymd 'date')
    df <- readr::read_csv(fileName)
    if (isTRUE(isCDCDaily)) {
        df <- df %>%
            rename(date=submission_date) %>%
            mutate(date=lubridate::mdy(date))
    } else {
        df <- df %>% 
            mutate(date=lubridate::ymd(date))
    }
    
    # Check that the file is unique by date-state
    if ((df %>% select(date, state) %>% anyDuplicated()) != 0) {
        stop("\nDuplicates by date and state, investigate and fix\n")
    } else {
        cat("\nFile is unique by state and date\n")
    }
    
    # Check for overall control totals in new file
    cat("\n\nOverall control totals in file:\n")
    df %>% 
        summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>% 
        print()
    
    # Get control totals by date for new file
    dfByDate <- df %>% 
        group_by(date) %>%
        summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
        ungroup() %>%
        pivot_longer(-date, values_to="newValue")
    
    # If there is no checkFile, then just produce a plot of the key metrics
    if (is.null(checkFile)) {
        p1 <- dfByDate %>% 
            ggplot(aes(x=date, y=newValue)) + 
            geom_line() + 
            facet_wrap(~name, nrow=1, scales="free_y") + 
            labs(title="Control totals by date for new file (no reference file)", x="", y="Summed Value")
        print(p1)
    } else {
        # Check for similarity of fields, dates, and states
        # (deparse(substitute()) recovers the caller's name for checkFile)
        cat("\n*** COMPARISONS TO REFERENCE FILE:", deparse(substitute(checkFile)))
        helperSimilarity(newData=names(df), refData=names(checkFile), label="column names")
        helperSimilarity(newData=df %>% pull(state) %>% unique(), 
                         refData=checkFile %>% pull(state) %>% unique(), 
                         label="states"
        )
        dateChangeList <- helperSimilarity(newData=df %>% 
                                               pull(date) %>% 
                                               unique() %>% 
                                               format("%Y-%m-%d") %>%
                                               sort(), 
                                           refData=checkFile %>% 
                                               pull(date) %>% 
                                               unique() %>% 
                                               format("%Y-%m-%d") %>%
                                               sort(),
                                           label="dates", 
                                           countOnly=dateChangePlot, 
                                           logFile=writeLog, 
                                           logAppend=!ovrwriteLog
                                           )
        
        # Plot date changes if requested
        if (dateChangePlot) {
            pDate <- tibble::tibble(date=as.Date(c(dateChangeList$d1, dateChangeList$d2)), 
                                    type=c(rep("Control File Only", length(dateChangeList$d1)), 
                                           rep("New File Only", length(dateChangeList$d2))
                                           )
                                    ) %>%
                ggplot(aes(x=date, fill=type)) + 
                geom_bar() + 
                coord_flip() + 
                labs(x="", y="", title="Dates in one file and not in the other")
            print(pDate)
        }
        
        # Check for similarity of control totals by date in files
        # (a difference is flagged only if it is 5+ units AND at least 1%)
        checkByDate <- checkFile %>% 
            group_by(date) %>%
            summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
            ungroup() %>%
            pivot_longer(-date, values_to="oldValue")
        deltaDate <- dfByDate %>% 
            inner_join(checkByDate, by=c("date", "name")) %>%
            filter(abs(newValue-oldValue)>=5, 
                   pmax(newValue, oldValue)>=1.01*pmin(newValue, oldValue)
            ) %>%
            as.data.frame()
        cat("\n\nDifference of 5+ that is at least 1% (summed to date and metric):", nrow(deltaDate))
        if (dateMetricPrint) {
            cat("\n")
            print(deltaDate)
        }
        else if (!is.null(writeLog)) {
            cat("\nDetailed output available in log:", writeLog)
            capture.output(cat("\n\nChange by date:\n"), print(deltaDate), file=writeLog, append=TRUE)
        }
        p1 <- dfByDate %>% 
            full_join(checkByDate, by=c("date", "name")) %>%
            pivot_longer(-c(date, name), names_to="newOld") %>%
            ggplot(aes(x=date, y=value, group=newOld, color=newOld)) + 
            geom_line() + 
            facet_wrap(~name, nrow=1, scales="free_y") + 
            labs(title="Control totals by date for new and reference file", x="", y="Summed Value")
        print(p1)
        
        # Check for similarity of control totals by controlBy in files
        # (semi_join limits the new file to dates the reference file covers)
        dfByControl <- df %>% 
            semi_join(select(checkFile, date), by="date") %>%
            group_by_at(vars(all_of(controlBy))) %>%
            summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
            ungroup() %>%
            pivot_longer(-all_of(controlBy), values_to="newValue")
        checkByControl <- checkFile %>% 
            group_by_at(vars(all_of(controlBy))) %>%
            summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
            ungroup() %>%
            pivot_longer(-all_of(controlBy), values_to="oldValue")
        deltaBy <- dfByControl %>% 
            inner_join(checkByControl, by=c(controlBy, "name")) %>%
            filter(abs(newValue-oldValue)>=5, 
                   pmax(newValue, oldValue)>=1.01*pmin(newValue, oldValue)
            ) %>%
            as.data.frame()
        cat("\n\nDifference of 5+ that is at least 1% (summed to", 
            controlBy, 
            "and metric):", 
            nrow(deltaBy), 
            "\n"
            )
        if (controlByMetricPrint) print(deltaBy)
    }
    
    # Return the processed data file
    df
    
}


# Function to select relevant variables and observations, and report on control totals
processCVData <- function(dfFull, 
                          varsKeep=c("date", "state", "positiveIncrease", "deathIncrease"), 
                          varsRename=c("positiveIncrease"="cases", "deathIncrease"="deaths"), 
                          stateList=c(state.abb, "DC"), 
                          isCDCDaily=FALSE,
                          comboStates=if(isTRUE(isCDCDaily)) c("NYC"="NY", "NY"="NY") else c(),
                          sumBy=c("state", "date")
                          ) {
    
    # FUNCTION ARGUMENTS
    # dfFull: the full data file originally loaded
    # varsKeep: variables to keep from the full file
    # varsRename: variables to be renamed, using a named vector of form originalName=newName
    # stateList: values for filtering state (NULL means do not run any filters)
    # isCDCDaily: boolean, are the daily data from CDC
    # comboStates: states to be consolidated ('name in file'='name to use')
    # sumBy: variables to sum by (after changing state names using comboStates)
    #
    # RETURNS: the summed data limited to valid states (validState column removed)
    
    # Select only the key variables
    # (select(all_of()) replaces the superseded select_at(vars()) idiom)
    df <- dfFull %>%
        select(all_of(varsKeep))
    
    # Apply the renaming of variables: look up each current name in varsRename,
    # keeping the original wherever the lookup returns NA (no rename requested)
    names(df) <- ifelse(is.na(varsRename[names(df)]), names(df), varsRename[names(df)])
    
    # Apply state name changes and sum
    # (across() replaces the superseded group_by_at()/summarize_if() idioms;
    #  .groups="drop" replaces the trailing ungroup())
    df <- df %>%
        mutate(state=ifelse(state %in% names(comboStates), unname(comboStates[state]), state)) %>%
        group_by(across(all_of(sumBy))) %>%
        summarize(across(where(is.numeric), ~sum(.x, na.rm=TRUE)), .groups="drop")
    
    # Designate each record as being either a valid state or not
    if (!is.null(stateList)) {
        df <- df %>%
            mutate(validState=state %in% stateList)
    } else {
        df <- df %>%
            mutate(validState=TRUE)
    }
    
    # Summarize the control totals for the data, based on whether the state is valid
    # (n counts records per validState group)
    cat("\n\nControl totals - note that validState other than TRUE will be discarded\n(na.rm=TRUE)\n\n")
    df %>%
        mutate(n=1) %>%
        group_by(validState) %>%
        summarize(across(where(is.numeric), ~sum(.x, na.rm=TRUE)), .groups="drop") %>%
        print()
    
    # Return the file, filtered to where validState is TRUE, and deleting variable validState
    df %>%
        filter(validState) %>%
        select(-validState)
    
}


# Function to download and process data (steps 2-4): optional download, read/compare,
# variable filtering/renaming, and conversion to per-capita metrics
helperReadRunCTP_steps02_04 <- function(downloadTo, 
                                        isCDCDaily, 
                                        readFrom, 
                                        compareFile,
                                        dateChangePlot,
                                        dateMetricPrint,
                                        writeLog,
                                        ovrwriteLog, 
                                        stateData,
                                        perCapitaVarsMap=NULL
                                        ) {
    
    # FUNCTION ARGUMENTS:
    # downloadTo: download the most recent COVID data to this location
    #             NULL means do not download any data
    # isCDCDaily: boolean, are the data from CDC (FALSE means use COVID Tracking Project)
    # readFrom: location for reading in the COVID data (callers default this to downloadTo)
    # compareFile: name of the file to use for comparisons when reading in raw data (NULL means no comparison)
    # dateChangePlot: boolean, should changes in dates be captured as a plot rather than as a list?
    # dateMetricPrint: boolean, should the changes by date and metric be printed to the main log?
    # writeLog: name of a separate log file for capturing detailed data on changes between files
    #           NULL means no detailed data captured
    # ovrwriteLog: boolean, should the log file be overwritten and started again from scratch?
    # stateData: state population data file
    # perCapitaVarsMap: mapVars to be used in helperMakePerCapita (NULL means create based on isCDCDaily)
    #
    # RETURNS: a list with the raw, filtered, and per-capita data frames
    
    # Helper function for glimpsing with a caption
    glimpseFile <- function(x, txt) {
        cat(txt)
        glimpse(x)
    }
    
    # Helper function: glimpse to the main log, or append to the separate log file if one was given
    logGlimpse <- function(x, txt) {
        if (is.null(writeLog)) glimpseFile(x, txt=txt)
        else capture.output(glimpseFile(x, txt=txt), file=writeLog, append=TRUE)
    }
    
    # STEP 2a: Download latest data (if requested)
    if (!is.null(downloadTo)) {
        
        # Use the proper API depending on whether the data are from CDC
        # (bug fix: this previously read `api <- api="..."`, a syntax error in R,
        # so the CDC branch could never execute)
        if (isTRUE(isCDCDaily)) {
            api <- "https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD"
        } else {
            api <- "https://api.covidtracking.com/v1/states/daily.csv"
        }
        
        # Download the relevant data to the proper location
        downloadCOVIDbyState(fileName=downloadTo, api=api)
    }
        
    # STEP 2b: Read-in the raw data
    # The control-total fields differ between the CDC daily data and the CTP data
    if (isTRUE(isCDCDaily)) controlFields <- c("tot_cases", "tot_death", "new_case", "new_death")
    else controlFields <- c("positiveIncrease", "deathIncrease", "hospitalizedCurrently")
    dfRaw <- readCOViDbyState(readFrom, 
                              checkFile=compareFile, 
                              controlFields=controlFields,
                              dateChangePlot=dateChangePlot, 
                              dateMetricPrint=dateMetricPrint, 
                              writeLog=writeLog, 
                              ovrwriteLog=ovrwriteLog, 
                              isCDCDaily=isCDCDaily
                              )
    # Glimpse the raw data, either in the main log file or in a separate log file
    logGlimpse(dfRaw, txt="\nRaw data file:\n")
        
        
    # STEP 3: Process the data so that it includes all requested key variables
    # Change variables to keep and renames depending on data source
    if (isTRUE(isCDCDaily)) {
        varsFilter <- c("date", "state", "tot_cases", "tot_death", "new_case", "new_death")
        varsRename <- c(tot_death="tot_deaths", new_case="new_cases", new_death="new_deaths")
    } else {
        varsFilter <- c("date", "state", "positiveIncrease", "deathIncrease", 
                        "hospitalizedCurrently", "totalTestResultsIncrease"
                        )
        varsRename <- c(positiveIncrease="cases", 
                        deathIncrease="deaths", 
                        hospitalizedCurrently="hosp", 
                        totalTestResultsIncrease="tests"
                        )
    }
    # Run the filtering process using the parameters above
    dfFiltered <- processCVData(dfRaw, varsKeep=varsFilter, varsRename=varsRename, isCDCDaily=isCDCDaily)
    
    # Glimpse the processed data, either in the main log file or in a separate log file
    logGlimpse(dfFiltered, txt="\nFiltered data file:\n")
        
        
    # STEP 4: Convert to per capita
    # Create the appropriate perCapitaVarsMap if it has not been passed
    if (is.null(perCapitaVarsMap)) {
        if(isTRUE(isCDCDaily)) {
            perCapitaVarsMap <- c("new_cases"="cpm", "new_deaths"="dpm", 
                                  "tot_cases"="tcpm", "tot_deaths"="tdpm"
                                  )
        } else {
            perCapitaVarsMap <- c("cases"="cpm", "deaths"="dpm", "hosp"="hpm", "tests"="tpm")
        }
    }
    
    # Convert to per capita
    dfPerCapita <- helperMakePerCapita(dfFiltered, mapVars=perCapitaVarsMap, popData=stateData)
    
    # Glimpse the per-capita data, either in the main log file or in a separate log file
    logGlimpse(dfPerCapita, txt="\nPer capita data file:\n")
    
    # Return the three key elements as a list
    list(dfRaw=dfRaw, dfFiltered=dfFiltered, dfPerCapita=dfPerCapita)
    
}


# Function to download/load, process, segment, and analyze data from COVID Tracking Project
# (also supports CDC daily data via isCDCDaily); always returns a list of the key data objects
readRunCOVIDTrackingProject <- function(thruLabel, 
                                        downloadTo=NULL, 
                                        readFrom=downloadTo, 
                                        compareFile=NULL,
                                        dateChangePlot=FALSE,
                                        dateMetricPrint=TRUE,
                                        writeLog=NULL,
                                        ovrwriteLog=TRUE,
                                        dfPerCapita=NULL,
                                        useClusters=NULL,
                                        hierarchical=TRUE,
                                        returnList=!isTRUE(hierarchical), 
                                        kCut=6,
                                        reAssignState=vector("list", 0),
                                        makeCumulativePlots=TRUE,
                                        skipAssessmentPlots=FALSE,
                                        isCDCDaily=FALSE,
                                        ...
                                        ) {
    
    # FUNCTION ARGUMENTS:
    # thruLabel: the label for when the data are through (e.g., "Aug 30, 2020")
    # downloadTo: download the most recent COVID Tracking Project data to this location
    #             NULL means do not download any data
    # readFrom: location for reading in the COVID Tracking Project data (defaults to downloadTo)
    # compareFile: name of the file to use for comparisons when reading in raw data (NULL means no comparison)
    # dateChangePlot: boolean, should changes in dates be captured as a plot rather than as a list?
    # dateMetricPrint: boolean, should the changes by date and metric be printed to the main log?
    # writeLog: name of a separate log file for capturing detailed data on changes between files
    #           NULL means no detailed data captured
    # ovrwriteLog: boolean, should the log file be overwritten and started again from scratch?
    # dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
    # useClusters: file containing clusters by state (NULL means make the clusters from the data)
    # hierarchical: boolean, should hierarchical clusters be produced (if FALSE, will be k-means)?
    #               NA is also accepted; see STEP 5, where the cluster object is then used directly
    # returnList: boolean, should a list be returned or just the cluster object?
    #             refers to what is returned by clusterStates(); the main function always returns a list
    # kCut: number of segments when cutting the hierarchical tree
    # reAssignState: mapping file for assigning a state to another state's cluster
    #                format list("stateToChange"="stateClusterToAssign")
    # makeCumulativePlots: whether to make plots of cumulative metrics
    # skipAssessmentPlots: boolean to skip the plots for assessClusters()
    #                      especially useful if just exploring dendrograms or silhouette widths
    # isCDCDaily: boolean, are the data from CDC (default FALSE means use CTP)
    # ...: arguments to be passed to clusterStates(), will be used only if useClusters is NULL
    
    
    # STEP 1: Get state data
    stateData <- getStateData()
    
    
    # STEPS 2-4 are run only if dfPerCapita does not exist
    if (is.null(dfPerCapita)) {
        # Call the helper function and pass the key arguments
        helpList <- helperReadRunCTP_steps02_04(downloadTo=downloadTo, 
                                                isCDCDaily=isCDCDaily, 
                                                readFrom=readFrom, 
                                                compareFile=compareFile, 
                                                dateChangePlot=dateChangePlot, 
                                                dateMetricPrint=dateMetricPrint, 
                                                writeLog=writeLog, 
                                                ovrwriteLog=ovrwriteLog, 
                                                stateData=stateData
                                                )
        # Extract the key data files from the helper list for future processing
        dfRaw <- helpList$dfRaw
        dfFiltered <- helpList$dfFiltered
        dfPerCapita <- helpList$dfPerCapita
    } else {
        # There is no raw or filtered data since a properly formatted dfPerCapita was passed
        dfRaw <- NULL
        dfFiltered <- NULL
    }
    
    # STEP 5: Create the clusters (if they have not been passed)
    if (is.null(useClusters)) {
        # Run the clustering process
        # With the default returnList=!isTRUE(hierarchical), clData is the bare cluster
        # object when hierarchical=TRUE and a list otherwise
        clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
        # If hierarchical clusters, cut the tree, otherwise use the output object directly
        if (isTRUE(hierarchical)) {
            useClusters <- cutree(clData, k=kCut)
        } else {
            # hierarchical=NA: objCluster is presumably already a named assignment vector;
            # hierarchical=FALSE: objCluster is a kmeans-style object with a $cluster element
            # NOTE(review): confirm both shapes against clusterStates()
            useClusters <- if(is.na(hierarchical)) clData$objCluster else clData$objCluster$cluster
        }
        # If requested, manually assign clusters to the cluster for another state
        # (each entry copies the target state's cluster id onto the named state)
        for (xNum in seq_len(length(reAssignState))) {
            useClusters[names(reAssignState)[xNum]] <- useClusters[reAssignState[[xNum]]]
        }
        
    }
    
    # STEP 5a: Stop the process and return what is available if skipAssessmentPlots is TRUE
    # (plot-related elements are returned as NULL so the list shape matches STEP 9)
    if (skipAssessmentPlots) {
        return(list(stateData=stateData, 
                    dfRaw=dfRaw, 
                    dfFiltered=dfFiltered, 
                    dfPerCapita=dfPerCapita, 
                    useClusters=useClusters, 
                    plotData=NULL, 
                    consolidatedPlotData=NULL, 
                    clCum=NULL
                    )
               )
    }
    
    
    # STEP 6: Create the cluster assessments
    # For CDC data, override the p4 variables/function since the metric names differ;
    # otherwise fall back to assessClusters() defaults
    p4Def <- eval(formals(assessClusters)$p4Vars)  # eval(formals(fun)$arg) returns the default arg in fun
    plotData <- assessClusters(useClusters, 
                               dfState=stateData, 
                               dfBurden=dfPerCapita,
                               thruLabel=thruLabel,
                               plotsTogether=TRUE, 
                               p4Vars=if(isCDCDaily) c(cases="tot_cases", deaths="tot_deaths") else p4Def,
                               p4Fun=if(isCDCDaily) max else eval(formals(assessClusters)$p4Fun), 
                               isCDC=isCDCDaily
                               )
    
    
    # STEP 7: Plot the consolidated metrics
    # The subtitle and main variable list track the source-specific metric names
    if (isCDCDaily) {
        subT <- "new_cases: new cases, new_deaths: new deaths"
        subT <- paste0(subT, ", tot_cases: cumulative cases, tot_deaths: cumulative deaths")
        varMain=c("state", "cluster", "date", "pop", "new_cases", "new_deaths", "tot_cases", "tot_deaths")
    } else {
        subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new), Tests: new tests"
        varMain <- c("state", "cluster", "date", "pop", "cases", "deaths", "hosp", "tests")
    }
    consolidatedPlotData <- plotConsolidatedMetrics(plotData, varMain=varMain, subT=subT, nrowPlot2=2)
    
    
    # STEP 8: Create cumulative metrics if requested
    if (makeCumulativePlots) {
        consPos <- consolidatedPlotData %>%
            ungroup() %>%
            select(state, cluster, date, name, vpm7) %>%
            arrange(state, cluster, date, name) %>%
            # pivot_wider's first unnamed argument is id_cols; -vpm7 keeps all identifiers
            pivot_wider(-vpm7, names_from="name", values_from="vpm7") %>%
            # CDC data carry no test counts, so percent positive is set to a constant 0
            mutate(pctpos=if(isTRUE(isCDCDaily)) 0 else cases/tests) %>%
            pivot_longer(-c(state, cluster, date), values_to="vpm7") %>%
            filter(!is.na(vpm7))
        typeKeep <- eval(formals(makeCumulative)$typeKeep)
        if(isTRUE(isCDCDaily)) typeKeep <- c("new_cases", "new_deaths", "tot_cases", "tot_deaths")
        clCum <- makeCumulative(consPos, typeKeep=typeKeep)
        # Cumulative plot of each variable
        a <- if(isTRUE(isCDCDaily)) c("tot_cases", "tot_deaths") else ""
        plotCumulativeData(clCum, keyMetricp2="", flagsp2="", makep1=TRUE, makep2=FALSE, namesOtherKeyVar=a)
        # Cumulative plots of relevant key variables
        a <- if(isTRUE(isCDCDaily)) c("new_deaths", "new_cases") else c("deaths", "cases", "tests")
        plotCumulativeData(clCum, keyMetricp2=a[1], flagsp2=findFlagStates(clCum, keyMetricVal=a[1]))
        plotCumulativeData(clCum, keyMetricp2=a[2], flagsp2=findFlagStates(clCum, keyMetricVal=a[2]))
        if(!isTRUE(isCDCDaily)) {
            # Test counts exist only in the CTP data, so the tests plot is CTP-only
            plotCumulativeData(clCum, keyMetricp2=a[3], flagsp2=findFlagStates(clCum, keyMetricVal=a[3]))
        }
    } else {
        clCum <- NULL
    }
    
    
    # STEP 9: Return a list of the key data
    list(stateData=stateData, 
         dfRaw=dfRaw, 
         dfFiltered=dfFiltered, 
         dfPerCapita=dfPerCapita, 
         useClusters=useClusters, 
         plotData=plotData, 
         consolidatedPlotData=consolidatedPlotData, 
         clCum=clCum
    )
    
    
}

The function is then run using previously downloaded data:

# Test the reading process only (works provided date is managed OK)
# readCOViDbyState(fileName="./RInputFiles/Coronavirus/CDC_dc_downloaded_210414.csv", 
#                  checkFile=readFromRDS("cdcDaily_hier7_210414")$dfRaw %>%
#                      mutate(date=lubridate::mdy(submission_date)), 
#                  controlFields=c("tot_cases", "tot_death", "new_case", "new_death"),
#                  dateChangePlot=TRUE,
#                  isCDCDaily=TRUE
#                  )

# Test full process using April 14, 2021 data
# NOTE(review): the result object is named ..._210421 while the data run through Apr 14
# (210414) - confirm whether the suffix should reflect the run date or the data date
# compareFile rebuilds a proper Date column from the cached raw file, since the CDC data
# store the date as a character submission_date
# hierarchical=NA presumably selects the path where the cluster object is used directly
# (see readRunCOVIDTrackingProject STEP 5) - TODO confirm against clusterStates()
# Arguments from shapeFunc onward are forwarded via ... to clusterStates()
locDownload <- "./RInputFiles/Coronavirus/CDC_dc_downloaded_210414.csv"
locRDS <- "cdcDaily_hier7_210414"
cdcDaily_rules7_210421 <- readRunCOVIDTrackingProject(thruLabel="Apr 14, 2021", 
                                                      readFrom=locDownload, 
                                                      compareFile=readFromRDS(locRDS)$dfRaw %>%
                                                          mutate(date=lubridate::mdy(submission_date)), 
                                                      isCDCDaily=TRUE,
                                                      hierarchical=NA,
                                                      shapeFunc=customTimeBucket,
                                                      minShape="2020-04", 
                                                      maxShape="2021-03", 
                                                      ratioDeathvsCase = 5, 
                                                      ratioTotalvsShape = 0.25, 
                                                      hmlSegs=3, 
                                                      eslSegs=3, 
                                                      seed=2104271501,
                                                      minDeath=100, 
                                                      minCase=10000, 
                                                      reAssignState=list("SD"="ND")  # put SD into ND's cluster
                                                      )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   submission_date = col_character(),
##   state = col_character(),
##   tot_cases = col_double(),
##   conf_cases = col_double(),
##   prob_cases = col_double(),
##   new_case = col_double(),
##   pnew_case = col_double(),
##   tot_death = col_double(),
##   conf_death = col_double(),
##   prob_death = col_double(),
##   new_death = col_double(),
##   pnew_death = col_double(),
##   created_at = col_character(),
##   consent_cases = col_character(),
##   consent_deaths = col_character()
## )
## 
## File is unique by state and date
## 
## 
## Overall control totals in file:
## # A tibble: 1 x 4
##    tot_cases tot_death new_case new_death
##        <dbl>     <dbl>    <dbl>     <dbl>
## 1 4503152844  96207327 30995591    546212
## 
## *** COMPARISONS TO REFERENCE FILE: compareFile
## 
## Checking for similarity of: column names
## In reference but not in current: submission_date
## In current but not in reference: 
## 
## Checking for similarity of: states
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: dates
## In reference but not in current: 
## In current but not in reference: 
## 
## Difference of 5+ that is at least 1% (summed to date and metric): 0
## [1] date     name     newValue oldValue
## <0 rows> (or 0-length row.names)

## 
## 
## Difference of 5+ that is at least 1% (summed to state and metric): 0 
## [1] state    name     newValue oldValue
## <0 rows> (or 0-length row.names)
## 
## Raw data file:
## Rows: 26,820
## Columns: 15
## $ date           <date> 2021-01-01, 2020-04-30, 2020-02-26, 2020-03-05, 202...
## $ state          <chr> "FL", "IA", "UT", "GA", "WV", "NY", "GA", "TN", "AS"...
## $ tot_cases      <dbl> 1300528, 7145, 0, 2, 1224, 346492, 28196, 143937, 0,...
## $ conf_cases     <dbl> NA, NA, NA, NA, 1224, NA, 28182, 141000, NA, NA, NA,...
## $ prob_cases     <dbl> NA, NA, NA, NA, 0, NA, 14, 2937, NA, NA, NA, NA, 152...
## $ new_case       <dbl> 0, 302, 0, -5, 29, 5775, 602, 1854, 0, 0, 7598, 55, ...
## $ pnew_case      <dbl> 6063, 0, NA, NA, 0, 0, 1, 38, 0, 0, 0, 0, 781, 705, ...
## $ tot_death      <dbl> 21673, 162, 0, 0, 50, 10117, 1258, 1567, 0, 0, 11177...
## $ conf_death     <dbl> NA, NA, NA, NA, NA, NA, 1258, 1527, NA, NA, NA, NA, ...
## $ prob_death     <dbl> NA, NA, NA, NA, NA, NA, 0, 40, NA, NA, NA, NA, 5080,...
## $ new_death      <dbl> 0, 14, 0, 0, 0, 56, 47, 4, 0, 0, 229, 3, 60, 10, 3, ...
## $ pnew_death     <dbl> 7, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 6, 2, 1, 478, ...
## $ created_at     <chr> "01/02/2021 02:50:51 PM", "05/01/2020 09:00:19 PM", ...
## $ consent_cases  <chr> "Not agree", "Not agree", "Agree", "Agree", "Agree",...
## $ consent_deaths <chr> "Not agree", "Not agree", "Agree", "Agree", "Not agr...
## 
## 
## Control totals - note that validState other than TRUE will be discarded
## (na.rm=TRUE)
## 
## # A tibble: 2 x 6
##   validState  tot_cases tot_deaths new_cases new_deaths     n
##   <lgl>           <dbl>      <dbl>     <dbl>      <dbl> <dbl>
## 1 FALSE        19865962     363163    127395       2326  3576
## 2 TRUE       4483286882   95844164  30868196     543886 22797
## 
## Filtered data file:
## Rows: 22,797
## Columns: 6
## $ state      <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "A...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ tot_cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tot_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ new_cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## 
## Per capita data file:
## Rows: 22,797
## Columns: 14
## $ state      <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "F...
## $ date       <date> 2020-01-22, 2020-01-22, 2020-01-22, 2020-01-22, 2020-01...
## $ tot_cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tot_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ new_cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cpm        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ dpm        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tcpm       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tdpm       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cpm7       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ dpm7       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tcpm7      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tdpm7      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...

## 
## Recency is defined as 2021-03-14 through current
## 
## Recency is defined as 2021-03-14 through current

The function is checked to confirm that it still works on legacy CTP data:

# Confirm that process still works for final CTP data
# Uses the default isCDCDaily=FALSE path (CTP variable names) with hierarchical
# clustering, cutting the tree into kCut=6 clusters; shapeFunc and the remaining
# arguments are forwarded via ... to clusterStates()
test_oldctp <- readRunCOVIDTrackingProject(thruLabel="Mar 31, 2021",
                                           readFrom="./RInputFiles/Coronavirus/CV_downloaded_210401.csv",
                                           compareFile=readFromRDS("ctp_hier6_210301")$dfRaw,
                                           hierarchical=TRUE,
                                           kCut=6,
                                           shapeFunc=customTimeBucket,
                                           minShape="2020-04",
                                           maxShape="2021-03",
                                           ratioDeathvsCase = 5,
                                           ratioTotalvsShape = 0.25,
                                           minDeath=100,
                                           minCase=10000
                                           )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   totalTestResultsSource = col_character(),
##   lastUpdateEt = col_character(),
##   dateModified = col_datetime(format = ""),
##   checkTimeEt = col_character(),
##   dateChecked = col_datetime(format = ""),
##   fips = col_character(),
##   dataQualityGrade = col_logical(),
##   hash = col_character(),
##   grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
## 
## File is unique by state and date
## 
## 
## Overall control totals in file:
## # A tibble: 1 x 3
##   positiveIncrease deathIncrease hospitalizedCurrently
##              <dbl>         <dbl>                 <dbl>
## 1         28756088        515148              20643306
## 
## *** COMPARISONS TO REFERENCE FILE: compareFile
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: states
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: dates
## In reference but not in current: 
## In current but not in reference: 2021-03-01 2021-03-02 2021-03-03 2021-03-04 2021-03-05 2021-03-06 2021-03-07
## 
## Difference of 5+ that is at least 1% (summed to date and metric): 0
## [1] date     name     newValue oldValue
## <0 rows> (or 0-length row.names)
## Warning: Removed 7 row(s) containing missing values (geom_path).

## 
## 
## Difference of 5+ that is at least 1% (summed to state and metric): 0 
## [1] state    name     newValue oldValue
## <0 rows> (or 0-length row.names)
## 
## Raw data file:
## Rows: 20,780
## Columns: 56
## $ date                        <date> 2021-03-07, 2021-03-07, 2021-03-07, 20...
## $ state                       <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive                    <dbl> 56886, 499819, 324818, 0, 826454, 35013...
## $ probableCases               <dbl> NA, 107742, 69092, NA, 56519, NA, 24786...
## $ negative                    <dbl> NA, 1931711, 2480716, 2140, 3073010, NA...
## $ pending                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource      <chr> "totalTestsViral", "totalTestsPeopleVir...
## $ totalTestResults            <dbl> 1731628, 2323788, 2736442, 2140, 790810...
## $ hospitalizedCurrently       <dbl> 33, 494, 335, NA, 963, 4291, 326, 428, ...
## $ hospitalizedCumulative      <dbl> 1293, 45976, 14926, NA, 57907, NA, 2390...
## $ inIcuCurrently              <dbl> NA, NA, 141, NA, 273, 1159, NA, NA, 38,...
## $ inIcuCumulative             <dbl> NA, 2676, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently       <dbl> 2, NA, 65, NA, 143, NA, NA, NA, 16, NA,...
## $ onVentilatorCumulative      <dbl> NA, 1515, 1533, NA, NA, NA, NA, NA, NA,...
## $ recovered                   <dbl> NA, 295690, 315517, NA, NA, NA, NA, NA,...
## $ lastUpdateEt                <chr> "3/5/2021 03:59", "3/7/2021 11:00", "3/...
## $ dateModified                <dttm> 2021-03-05 03:59:00, 2021-03-07 11:00:...
## $ checkTimeEt                 <chr> "03/04 22:59", "03/07 06:00", "03/06 19...
## $ death                       <dbl> 305, 10148, 5319, 0, 16328, 54124, 5989...
## $ hospitalized                <dbl> 1293, 45976, 14926, NA, 57907, NA, 2390...
## $ hospitalizedDischarged      <dbl> NA, NA, NA, NA, 118932, NA, 23003, NA, ...
## $ dateChecked                 <dttm> 2021-03-05 03:59:00, 2021-03-07 11:00:...
## $ totalTestsViral             <dbl> 1731628, NA, 2736442, 2140, 7908105, 49...
## $ positiveTestsViral          <dbl> 68693, NA, NA, NA, NA, NA, NA, 323284, ...
## $ negativeTestsViral          <dbl> 1660758, NA, 2480716, NA, NA, NA, NA, 6...
## $ positiveCasesViral          <dbl> NA, 392077, 255726, NA, 769935, 3501394...
## $ deathConfirmed              <dbl> NA, 7963, 4308, NA, 14403, NA, 5251, 63...
## $ deathProbable               <dbl> NA, 2185, 1011, NA, 1925, NA, 735, 1377...
## $ totalTestEncountersViral    <dbl> NA, NA, NA, NA, NA, NA, 6415123, NA, 12...
## $ totalTestsPeopleViral       <dbl> NA, 2323788, NA, NA, 3842945, NA, 26165...
## $ totalTestsAntibody          <dbl> NA, NA, NA, NA, 580569, NA, 435053, NA,...
## $ positiveTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 63087, NA, NA, ...
## $ negativeTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 369995, NA, NA,...
## $ totalTestsPeopleAntibody    <dbl> NA, 119757, NA, NA, 444089, NA, NA, NA,...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen     <dbl> NA, NA, 481311, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen  <dbl> NA, NA, 81803, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen           <dbl> NA, NA, NA, NA, NA, NA, NA, 396680, NA,...
## $ positiveTestsAntigen        <dbl> NA, NA, NA, NA, NA, NA, NA, 22245, NA, ...
## $ fips                        <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease            <dbl> 0, 408, 165, 0, 1335, 3816, 840, 0, 146...
## $ negativeIncrease            <dbl> 0, 2087, 3267, 0, 13678, 0, 0, 0, 0, 91...
## $ total                       <dbl> 56886, 2431530, 2805534, 2140, 3899464,...
## $ totalTestResultsIncrease    <dbl> 0, 2347, 3380, 0, 45110, 133186, 38163,...
## $ posNeg                      <dbl> 56886, 2431530, 2805534, 2140, 3899464,...
## $ dataQualityGrade            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ deathIncrease               <dbl> 0, -1, 22, 0, 5, 258, 3, 0, 0, 9, 66, 1...
## $ hospitalizedIncrease        <dbl> 0, 0, 11, 0, 44, 0, 18, 0, 0, 0, 92, 35...
## $ hash                        <chr> "dc4bccd4bb885349d7e94d6fed058e285d4be1...
## $ commercialScore             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## 
## 
## Control totals - note that validState other than TRUE will be discarded
## (na.rm=TRUE)
## 
## # A tibble: 2 x 6
##   validState    cases deaths     hosp     tests     n
##   <lgl>         <dbl>  <dbl>    <dbl>     <dbl> <dbl>
## 1 FALSE        111926   2219   114050    596489  1785
## 2 TRUE       28644162 512929 20529256 363227513 18995
## 
## Filtered data file:
## Rows: 18,995
## Columns: 6
## $ state  <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", ...
## $ date   <date> 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10,...
## $ cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 3, 3, 1, 10, 13, 4, 7...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ hosp   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tests  <dbl> 0, 4, 2, 9, 0, 23, 0, 14, 84, 0, 0, 193, 75, 26, 260, 74, 19...
## 
## Per capita data file:
## Rows: 18,995
## Columns: 14
## $ state  <chr> "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "MA", ...
## $ date   <date> 2020-01-13, 2020-01-14, 2020-01-15, 2020-01-16, 2020-01-17,...
## $ cases  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tests  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ cpm    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ dpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tpm    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ cpm7   <dbl> NA, NA, NA, 0.01876023, 0.01876023, 0.03752046, 0.03752046, ...
## $ dpm7   <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, NA, 0, NA, 0, 0, 0, 0, ...
## $ hpm7   <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, NA, 0, NA, 0, 0, 0, 0, ...
## $ tpm7   <dbl> NA, NA, NA, 0.00000000, 0.00000000, 0.00000000, 0.00000000, ...

## 
## Recency is defined as 2021-02-06 through current
## 
## Recency is defined as 2021-02-06 through current

Next steps are to search for hospital and test data that can be integrated to more closely replicate what was available from CTP.

Hospital data are downloaded, with the process cached to avoid repeated hits against the CDC website:

# Cached download of the CDC hospitalization file; ovrWrite=FALSE preserves an
# existing local copy so the CDC endpoint is not hit on every knit
downloadCOVIDbyState("./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv",
                     api="https://beta.healthdata.gov/api/views/g62h-syeh/rows.csv?accessType=DOWNLOAD", 
                     ovrWrite=FALSE
                     )
##                                                          size isdir mode
## ./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv 6437517 FALSE  666
##                                                                     mtime
## ./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv 2021-04-29 08:50:43
##                                                                     ctime
## ./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv 2021-04-29 08:50:37
##                                                                     atime exe
## ./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv 2021-04-29 08:50:43  no

Fields available in the dataset are described at healthdata.gov. Salient fields include:

  • state - state (includes 50 states, DC, PR, VI)
  • date - date of reported data
  • inpatient_beds_used_covid - reported patients currently hospitalized with confirmed or suspected COVID-19
  • total_adult_patients_hospitalized_confirmed_and_suspected_covid - reported patients in adult inpatient beds with confirmed or suspected COVID-19 (includes observation beds)
  • total_pediatric_patients_hospitalized_confirmed_and_suspected_covid - reported patients in pediatric inpatient beds with confirmed or suspected COVID-19 (includes observation beds)
  • staffed_icu_adult_patients_confirmed_and_suspected_covid - reported patients in an adult ICU bed who have suspected or confirmed COVID-19
  • previous_day_admission_[adult|pediatric]_covid_[confirmed|suspected] - previous day admissions to a hospital with confirmed or suspected COVID-19

Data availability varies over time, and there are also coverage variables included for each of the key statistics. Data are loaded, and a plot of data availability by state and metric is created:

# Read and glimpse downloaded hospital file
# read_csv guesses column types; per the captured spec, all columns are numeric
# except state, date, and geocoded_state
hospRaw <- readr::read_csv("./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   date = col_date(format = ""),
##   geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
glimpse(hospRaw)
## Rows: 22,435
## Columns: 61
## $ state                                                                        <chr> ...
## $ date                                                                         <date> ...
## $ critical_staffing_shortage_today_yes                                         <dbl> ...
## $ critical_staffing_shortage_today_no                                          <dbl> ...
## $ critical_staffing_shortage_today_not_reported                                <dbl> ...
## $ critical_staffing_shortage_anticipated_within_week_yes                       <dbl> ...
## $ critical_staffing_shortage_anticipated_within_week_no                        <dbl> ...
## $ critical_staffing_shortage_anticipated_within_week_not_reported              <dbl> ...
## $ hospital_onset_covid                                                         <dbl> ...
## $ hospital_onset_covid_coverage                                                <dbl> ...
## $ inpatient_beds                                                               <dbl> ...
## $ inpatient_beds_coverage                                                      <dbl> ...
## $ inpatient_beds_used                                                          <dbl> ...
## $ inpatient_beds_used_coverage                                                 <dbl> ...
## $ inpatient_beds_used_covid                                                    <dbl> ...
## $ inpatient_beds_used_covid_coverage                                           <dbl> ...
## $ previous_day_admission_adult_covid_confirmed                                 <dbl> ...
## $ previous_day_admission_adult_covid_confirmed_coverage                        <dbl> ...
## $ previous_day_admission_adult_covid_suspected                                 <dbl> ...
## $ previous_day_admission_adult_covid_suspected_coverage                        <dbl> ...
## $ previous_day_admission_pediatric_covid_confirmed                             <dbl> ...
## $ previous_day_admission_pediatric_covid_confirmed_coverage                    <dbl> ...
## $ previous_day_admission_pediatric_covid_suspected                             <dbl> ...
## $ previous_day_admission_pediatric_covid_suspected_coverage                    <dbl> ...
## $ staffed_adult_icu_bed_occupancy                                              <dbl> ...
## $ staffed_adult_icu_bed_occupancy_coverage                                     <dbl> ...
## $ staffed_icu_adult_patients_confirmed_and_suspected_covid                     <dbl> ...
## $ staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage            <dbl> ...
## $ staffed_icu_adult_patients_confirmed_covid                                   <dbl> ...
## $ staffed_icu_adult_patients_confirmed_covid_coverage                          <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_and_suspected_covid              <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage     <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_covid                            <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_covid_coverage                   <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_and_suspected_covid          <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_covid                        <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_covid_coverage               <dbl> ...
## $ total_staffed_adult_icu_beds                                                 <dbl> ...
## $ total_staffed_adult_icu_beds_coverage                                        <dbl> ...
## $ inpatient_beds_utilization                                                   <dbl> ...
## $ inpatient_beds_utilization_coverage                                          <dbl> ...
## $ inpatient_beds_utilization_numerator                                         <dbl> ...
## $ inpatient_beds_utilization_denominator                                       <dbl> ...
## $ percent_of_inpatients_with_covid                                             <dbl> ...
## $ percent_of_inpatients_with_covid_coverage                                    <dbl> ...
## $ percent_of_inpatients_with_covid_numerator                                   <dbl> ...
## $ percent_of_inpatients_with_covid_denominator                                 <dbl> ...
## $ inpatient_bed_covid_utilization                                              <dbl> ...
## $ inpatient_bed_covid_utilization_coverage                                     <dbl> ...
## $ inpatient_bed_covid_utilization_numerator                                    <dbl> ...
## $ inpatient_bed_covid_utilization_denominator                                  <dbl> ...
## $ adult_icu_bed_covid_utilization                                              <dbl> ...
## $ adult_icu_bed_covid_utilization_coverage                                     <dbl> ...
## $ adult_icu_bed_covid_utilization_numerator                                    <dbl> ...
## $ adult_icu_bed_covid_utilization_denominator                                  <dbl> ...
## $ adult_icu_bed_utilization                                                    <dbl> ...
## $ adult_icu_bed_utilization_coverage                                           <dbl> ...
## $ adult_icu_bed_utilization_numerator                                          <dbl> ...
## $ adult_icu_bed_utilization_denominator                                        <dbl> ...
## $ geocoded_state                                                               <chr> ...
# Plot of states with data availability by metric and month
# Step 1: sum each numeric metric by state and time bucket (non-numeric columns drop out)
# Step 2: per time bucket, count entities with a positive total -- note that 'state'
#         (character) is included in the second summarize, as in the original
#         summarize_all() version, yielding a row counting reporting states
# summarize_if()/summarize_all() are superseded; across() is the current idiom
hospRaw %>% 
    mutate(tb=customTimeBucket(date)) %>% 
    group_by(state, tb) %>% 
    summarize(across(where(is.numeric), ~sum(.x, na.rm=TRUE)), .groups="drop") %>% 
    group_by(tb) %>% 
    summarize(across(everything(), ~sum(.x > 0)), .groups="drop") %>% 
    pivot_longer(-tb) %>% 
    ggplot(aes(x=tb, y=name)) + 
    geom_tile(aes(fill=value)) + 
    geom_text(aes(label=value)) + 
    scale_fill_continuous(low="white", high="green")

# Check state coverage: which reported 'state' codes fall outside the 50 US states
# (distinct+arrange reproduces count()'s sorted state order)
hospRaw %>%
    distinct(state) %>%
    arrange(state) %>%
    pull(state) %>%
    setdiff(state.abb)
## [1] "DC" "PR" "VI"

There are 53 states included (the 50 US states plus DC, PR, VI). Many of the salient metrics became available in July 2020, though inpatient_beds_used_covid appears to be available throughout. Three key fields are explored in more detail:

# Hospital summary by date
# Totals across states for three candidate hospitalization metrics;
# summarize_all() is superseded, so use summarize(across(...)) instead
hospSum <- hospRaw %>%
    select(date, 
           inp=inpatient_beds_used_covid, 
           hosp_adult=total_adult_patients_hospitalized_confirmed_and_suspected_covid, 
           hosp_ped=total_pediatric_patients_hospitalized_confirmed_and_suspected_covid
           ) %>%
    group_by(date) %>%
    summarize(across(everything(), ~sum(.x, na.rm=TRUE)), .groups="drop")

# Summary of 'hosp' field from CTP: national total hospitalized by date
hospCTP <- summarize(
    group_by(ctp_hier6_210401$dfFiltered, date),
    hosp_ctp = sum(hosp, na.rm = TRUE),
    .groups = "drop"
)

# Plotted by date, compared with field 'hosp' from CTP
# Long format with one line per source metric; missing values dropped before plotting
hospSum %>%
    full_join(hospCTP, by="date") %>%
    pivot_longer(-date) %>%
    drop_na(value) %>%
    ggplot(aes(x=date, y=value, group=name, color=name)) + 
    geom_line() + 
    labs(title="COVID Hospitalized by Data Source", x="", y="Hospitalized") + 
    scale_color_discrete("Metric")

COVID Tracking Project ran a data cleaning algorithm, resulting in some differences. The field “inp” appears to track the CTP metric reasonably well, and the alignment by state for April 2020 through February 2021 is explored:

# Side-by-side CDC (inp) vs CTP (ctp) hospitalization counts by state and date,
# restricted to the 50 states plus DC over Apr 2020 - Feb 2021
hospCompare <- hospRaw %>%
    transmute(state, date, inp=inpatient_beds_used_covid) %>%
    full_join(
        ctp_hier6_210401$dfFiltered %>% select(state, date, ctp=hosp), 
        by=c("state", "date")
    ) %>%
    filter(
        state %in% c(state.abb, "DC"), 
        between(date, as.Date("2020-04-01"), as.Date("2021-02-28"))
    )

# Get correlations by state (only for dates where both supply data)
# First, count missing values per column to see how many rows lack a CTP value
colSums(is.na(hospCompare))
## state  date   inp   ctp 
##     0     0     0   807
# Per-state agreement statistics between CDC 'inp' and CTP 'ctp'
# NOTE: arguments inside summarize() evaluate sequentially, so once inp and ctp
# are reassigned to their means, later expressions see the scalar means -- rsq
# below is therefore 1 - rmse/mean(ctp), a pseudo-R-squared, NOT a true R^2
hospCorr <- hospCompare %>%
    filter(!is.na(ctp)) %>%
    group_by(state) %>%
    summarize(rmse=sqrt(mean((inp-ctp)**2)), 
              corr=cor(inp, ctp), 
              inp=mean(inp),  # overwrites column 'inp' with its state mean
              ctp=mean(ctp),  # overwrites column 'ctp' with its state mean
              rsq=1-rmse/ctp,  # uses the scalar mean(ctp) computed just above
              .groups="drop"
              )

# Plot states by correlation, rsq, and RMSE
# Long format, one facet per statistic, states ordered by their worst value
plotDat <- hospCorr %>%
    transmute(state, rmse, corr, .pseudorsq = rsq) %>%
    pivot_longer(-state)
ggplot(plotDat, aes(x = fct_reorder(state, value, .fun = min), y = value)) +
    geom_col(fill = "lightblue") +
    coord_flip() +
    facet_wrap(~name, scales = "free_x")

# Plot by individual states, with 'most different' first
# State order: ascending pseudo-rsq (worst agreement leads the facets)
hospDiff <- hospCorr$state[order(hospCorr$rsq)]

hospCompare %>%
    mutate(state=factor(state, levels=hospDiff)) %>%
    pivot_longer(-c(state, date)) %>%
    drop_na(value) %>%
    ggplot(aes(x=date, y=value, group=name, color=name)) + 
    geom_line() + 
    facet_wrap(~state, scales="free_y")

While there are some meaningful disconnects in the data, particularly in the earlier time periods, there is generally good correlation of the timing and magnitude of peaks and troughs. The new hospitalized data appear reasonable for usage with the CDC daily data for purposes of highlighting macro trends in disease impact by state. For potential future exploration, the CTP data could be used through 2020 with the new data picked up starting in 2021.

Hospital data are manually added to the existing dfPerCapita file:

# Create the hospital data file: state/date/inp for the 50 states plus DC,
# dropping missing inp and ordering for stable downstream joins
hospData <- hospRaw %>%
    transmute(state, date, inp = inpatient_beds_used_covid) %>%
    filter(state %in% c(state.abb, "DC"), !is.na(inp)) %>%
    arrange(date, state)

# Check that there are no duplicates by the state-date key
if (anyDuplicated(select(hospData, state, date)) != 0) stop("\nDuplicates in the hospital data\n")

# Create hpm (per capita) and hpm7 (per capita rolling 7) variables
hospDataPC <- helperMakePerCapita(hospData, mapVars = c("inp" = "hpm"), popData = getStateData())
summarize(hospDataPC, across(where(is.numeric), sum, na.rm = TRUE))
## # A tibble: 1 x 3
##        inp      hpm     hpm7
##      <dbl>    <dbl>    <dbl>
## 1 24939753 3513219. 3491948.
# Create a full per-capita file, taking only state-date combinations that are in dfPerCapita
# (left_join keeps exactly the existing rows; hospital columns are NA where unmatched)
dfPerCapitaTemp <- left_join(cdcDaily_rules7_210421$dfPerCapita, hospDataPC, by = c("state", "date"))
summarize(dfPerCapitaTemp, across(where(is.numeric), sum, na.rm = TRUE))
## # A tibble: 1 x 15
##   tot_cases tot_deaths new_cases new_deaths    cpm    dpm   tcpm   tdpm   cpm7
##       <dbl>      <dbl>     <dbl>      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1    4.48e9   95844164  30868196     543886 4.76e6 79001. 6.92e8 1.34e7 4.74e6
## # ... with 6 more variables: dpm7 <dbl>, tcpm7 <dbl>, tdpm7 <dbl>, inp <dbl>,
## #   hpm <dbl>, hpm7 <dbl>

The file can then be run through the main function, with the per capita file and existing clusters passed:

# Test full process using hospital data
# Passing dfPerCapita skips the download/read steps; passing useClusters skips
# re-clustering, so this run reuses the existing CDC-daily cluster assignments
updDaily_rules7_210421 <- readRunCOVIDTrackingProject(thruLabel="Apr 14, 2021", 
                                                      isCDCDaily=TRUE,
                                                      dfPerCapita=dfPerCapitaTemp, 
                                                      useClusters=cdcDaily_rules7_210421$useClusters
                                                      )

The main function requires updates so that it can optionally use the hospital (or any other) data:

# Create a subtitle mapping file (variable to descriptive name)
# Keys cover both the CDC daily columns (new_/tot_ prefixes, inp) and the legacy
# COVID Tracking Project columns (cases, deaths, hosp, tests); values are the
# human-readable labels used when building the step-7 plot subtitle
subTMapper <- c("new_cases"="new cases", 
                "new_deaths"="new deaths", 
                "tot_cases"="cumulative cases", 
                "tot_deaths"="cumulative deaths", 
                "inp"="total hospitalized (not new or cumulative)", 
                "cases"="new cases", 
                "deaths"="new deaths", 
                "hosp"="total in hospital (not new)", 
                "tests"="new tests"
                )


# Helper function to make consolidated cumulative data
# Widens df by nameVar, optionally adds pctpos (cases/tests), then re-lengthens,
# so every metric -- including the derived pctpos -- shares one long format.
# Uses all_of()/.data[[ ]] to avoid the ambiguous external-vector selections that
# previously emitted tidyselect Notes, and replaces superseded select_at/arrange_at.
helperConsCum <- function(df, 
                          byVars=c("state", "cluster", "date"),
                          nameVar=c("name"),
                          numVar=c("vpm7"), 
                          makePctPos=FALSE
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: long-format data with key columns byVars, a metric-name column nameVar,
    #     and a metric-value column numVar
    # byVars: key variables that identify an observation
    # nameVar: single column name (string) holding the metric name
    # numVar: single column name (string) holding the metric value
    # makePctPos: boolean, should pctpos=cases/tests be computed (requires 'cases'
    #             and 'tests' among the nameVar values)? FALSE yields pctpos=0
    
    df %>%
        ungroup() %>%
        select(all_of(c(byVars, nameVar, numVar))) %>%
        arrange(across(all_of(c(byVars, nameVar)))) %>%
        # explicit id_cols avoids relying on pivot_wider's implicit exclusion of
        # names_from/values_from columns
        pivot_wider(id_cols=all_of(byVars), names_from=all_of(nameVar), values_from=all_of(numVar)) %>%
        mutate(pctpos=if(isTRUE(makePctPos)) cases/tests else 0) %>%
        pivot_longer(-all_of(byVars), values_to=numVar) %>%
        filter(!is.na(.data[[numVar]]))
}


# Function to download/load, process, segment, and analyze data from COVID Tracking Project
readRunCOVIDTrackingProject <- function(thruLabel, 
                                        downloadTo=NULL, 
                                        readFrom=downloadTo, 
                                        compareFile=NULL,
                                        dateChangePlot=FALSE,
                                        dateMetricPrint=TRUE,
                                        writeLog=NULL,
                                        ovrwriteLog=TRUE,
                                        dfPerCapita=NULL,
                                        useClusters=NULL,
                                        hierarchical=TRUE,
                                        returnList=!isTRUE(hierarchical), 
                                        kCut=6,
                                        reAssignState=vector("list", 0),
                                        makeCumulativePlots=TRUE,
                                        skipAssessmentPlots=FALSE,
                                        varMainS7=NULL,
                                        mapperS7=subTMapper,
                                        typeKeepS8=NULL,
                                        isCDCDaily=FALSE,
                                        ...
                                        ) {
    
    # FUNCTION ARGUMENTS:
    # thruLabel: the label for when the data are through (e.g., "Aug 30, 2020")
    # downloadTo: download the most recent COVID Tracking Project data to this location
    #             NULL means do not download any data
    # readFrom: location for reading in the COVID Tracking Project data (defaults to donwloadTo)
    # compareFile: name of the file to use for comparisons when reading in raw data (NULL means no comparison)
    # dateChangePlot: boolean, should changes in dates be captured as a plot rather than as a list?
    # dateMetricPrint: boolean, should the changes by date and metric be printed to the main log?
    # writeLog: name of a separate log file for capturing detailed data on changes between files
    #           NULL means no detailed data captured
    # ovrwriteLog: boolean, should the log file be overwritten and started again from scratch?
    # dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
    # useClusters: file containing clusters by state (NULL means make the clusters from the data)
    # hierarchical: boolean, should hierarchical clusters be produced (if FALSE, will be k-means)?
    # returnList: boolean, should a list be returned or just the cluster object?
    #             refers to what is returned by clusterStates(); the main function always returns a list
    # kCut: number of segments when cutting the hierarchical tree
    # reAssignState: mapping file for assigning a state to another state's cluster
    #                format list("stateToChange"="stateClusterToAssign")
    # makeCumulativePlots: whether to make plots of cumulative metrics
    # skipAssessmentPlots: boolean to skip the plots for assessClusters()
    #                      especially useful if just exploring dendrograms or silhouette widths
    # varMainS7: main variables to use for step 7 (consolidated metrics): NULL means assume CTP defaults
    # mapperS7: mapping file for step 7 (variables to descriptive names)
    # typeKeepS8: variables to be plotted in step 8 (NULL means assume CTP defaults)
    # isCDCDaily: boolean, are the data from CDC (default FALSE means use CTP)
    # ...: arguments to be passed to clusterStates(), will be used only if useClusters is NULL
    
    
    # STEP 1: Get state data
    stateData <- getStateData()
    
    # STEPS 2-4 are run only if dfPerCapita does not exist
    if (is.null(dfPerCapita)) {
        # Call the helper function and pass the key arguments
        helpList <- helperReadRunCTP_steps02_04(downloadTo=downloadTo, 
                                                isCDCDaily=isCDCDaily, 
                                                readFrom=readFrom, 
                                                compareFile=compareFile, 
                                                dateChangePlot=dateChangePlot, 
                                                dateMetricPrint=dateMetricPrint, 
                                                writeLog=writeLog, 
                                                ovrwriteLog=ovrwriteLog, 
                                                stateData=stateData
                                                )
        # Extract the key data files from the helper list for future processing
        dfRaw <- helpList$dfRaw
        dfFiltered <- helpList$dfFiltered
        dfPerCapita <- helpList$dfPerCapita
    } else {
        # There is no raw or filtered data since a properly formatted dfPerCapita was passed
        dfRaw <- NULL
        dfFiltered <- NULL
    }
    
    # STEP 5: Create the clusters (if they have not been passed)
    if (is.null(useClusters)) {
        # Run the clustering process
        clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
        # If hierarchical clusters, cut the tree, otherwise use the output object directly
        if (isTRUE(hierarchical)) {
            useClusters <- cutree(clData, k=kCut)
        } else {
            useClusters <- if(is.na(hierarchical)) clData$objCluster else clData$objCluster$cluster
        }
        # If requested, manually assign clusters to the cluster for another state
        for (xNum in seq_along(reAssignState)) {
            useClusters[names(reAssignState)[xNum]] <- useClusters[reAssignState[[xNum]]]
        }
        
    }
    
    # STEP 5a: Stop the process and return what is available if skipAssessmentPlots is TRUE
    if (isTRUE(skipAssessmentPlots)) {
        return(list(stateData=stateData, 
                    dfRaw=dfRaw, 
                    dfFiltered=dfFiltered, 
                    dfPerCapita=dfPerCapita, 
                    useClusters=useClusters, 
                    plotData=NULL, 
                    consolidatedPlotData=NULL, 
                    clCum=NULL
                    )
               )
    }
    
    # STEP 6: Create the cluster assessments using cases and deaths
    p4Def <- eval(formals(assessClusters)$p4Vars)  # eval(formals(fun)$arg) returns the default arg in fun
    plotData <- assessClusters(useClusters, 
                               dfState=stateData, 
                               dfBurden=dfPerCapita,
                               thruLabel=thruLabel,
                               plotsTogether=TRUE, 
                               p4Vars=if(isCDCDaily) c(cases="tot_cases", deaths="tot_deaths") else p4Def,
                               p4Fun=if(isCDCDaily) max else eval(formals(assessClusters)$p4Fun), 
                               isCDC=isCDCDaily
                               )
    
    
    # STEP 7: Plot the consolidated metrics
    # Create CTP defaults if passed as NULL
    if (is.null(varMainS7)) 
        varMainS7 <- c("state", "cluster", "date", "pop", "cases", "deaths", "hosp", "tests")
    # Create description for subtitle
    # FIX: use the mapperS7 argument (previously hard-coded to subTMapper, which
    # silently ignored any custom mapping passed by the caller)
    subT <- varMainS7[varMainS7 %in% names(mapperS7)] %>% 
        paste(., mapperS7[.] %>% unname, sep=": ") %>% 
        paste0(collapse=", ")
    # Create consolidated metrics plot
    consolidatedPlotData <- plotConsolidatedMetrics(plotData, varMain=varMainS7, subT=subT, nrowPlot2=2)
    
    
    # STEP 8: Create cumulative metrics if requested
    if (makeCumulativePlots) {
        consPos <- helperConsCum(consolidatedPlotData)
        if (is.null(typeKeepS8)) typeKeepS8 <- eval(formals(makeCumulative)$typeKeep)
        clCum <- makeCumulative(consPos, typeKeep=typeKeepS8)
        # Cumulative plot of each variable (for CDC daily, tot_cases and tot_deaths are already cumulative)
        a <- if(isTRUE(isCDCDaily)) c("tot_cases", "tot_deaths") else ""
        plotCumulativeData(clCum, keyMetricp2="", flagsp2="", makep1=TRUE, makep2=FALSE, namesOtherKeyVar=a)
        # Cumulative plots of relevant key variables
        a <- if(isTRUE(isCDCDaily)) c("new_deaths", "new_cases") else c("deaths", "cases", "tests")
        for (vrbl in a) 
            plotCumulativeData(clCum, keyMetricp2=vrbl, flagsp2=findFlagStates(clCum, keyMetricVal=vrbl))
    } else {
        clCum <- NULL
    }
    
    
    # STEP 9: Return a list of the key data
    list(stateData=stateData, 
         dfRaw=dfRaw, 
         dfFiltered=dfFiltered, 
         dfPerCapita=dfPerCapita, 
         useClusters=useClusters, 
         plotData=plotData, 
         consolidatedPlotData=consolidatedPlotData, 
         clCum=clCum
    )
    
}

The routine is then run:

# Test full process using hospital data
updDaily_rules7_210421 <- readRunCOVIDTrackingProject(thruLabel="Apr 14, 2021", 
                                                      dfPerCapita=dfPerCapitaTemp, 
                                                      useClusters=cdcDaily_rules7_210421$useClusters, 
                                                      varMainS7=c("state", "cluster", "date", "pop", 
                                                                  "new_cases", "new_deaths", 
                                                                  "tot_cases", "tot_deaths", "inp"
                                                                  ), 
                                                      typeKeepS8=c("new_cases", "new_deaths", 
                                                                   "tot_cases", "tot_deaths"
                                                                   ), 
                                                      isCDCDaily=TRUE
                                                      )

## 
## Recency is defined as 2021-03-14 through current
## 
## Recency is defined as 2021-03-14 through current

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(nameVar)` instead of `nameVar` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(numVar)` instead of `numVar` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

Next steps are to update the function so that it can download and integrate data from multiple sources (for CDC daily which keeps cases/deaths, hospitalizations, and other metrics in separate file).

The initial portions of steps 2-4 are split into several smaller functions for easier use:

  • fileDownload - downloads a file and saves to a location, checking not to overwrite
  • fileRead - reads a file and outputs as a data frame
  • colRenamer - renames any number of columns using a named vector c(‘current name’=‘new name’)
  • colMutater - applies functions and transformations to a data frame, passed as a list
  • checkUniqueRows - checks that a file is unique by designated keys
  • checkControl - gets control totals for a file for a specified unique key and set of variables

This data can then be used to assess differences in a current file and reference file. Code includes:

# Generic function to download data given a download location and URL
fileDownload <- function(fileName, 
                         url, 
                         ovrWrite=FALSE, 
                         ...
                         ) {
    
    # FUNCTION ARGUMENTS:
    # fileName: the filename that the data will be saved to
    # url: the URL to pull the data from
    # ovrWrite: whether to allow overwriting of the existing fileName
    # ...: other arguments to pass to download.file
    
    # Guard clause: refuse to clobber an existing file unless ovrWrite is set
    if (file.exists(fileName)) {
        cat("\nFile already exists at:", fileName, "\n")
        if (!ovrWrite) stop("Exiting due to ovrWrite=FALSE and a duplicate fileName\n")
        cat("Will over-write with current data from", url, "\n")
    }
    
    # Pull the data down to fileName, forwarding any extra download.file options
    download.file(url, destfile=fileName, ...)
    
    # Return statistics on the downloaded file (size, mode, timestamps)
    file.info(fileName)
    
}


# Generic function to read an existing file
fileRead <- function(fileName, 
                     fnRead=readr::read_csv, 
                     ...
                     ) {
    
    # FUNCTION ARGUMENTS:
    # fileName: file location for reading
    # fnRead: function for reading fileName
    # ...: other arguments to be passed to fnRead
    
    # Delegate to the supplied reader and hand back whatever it produces
    contents <- fnRead(fileName, ...)
    contents
    
}


# Generic function to rename columns in a file using an input vector
colRenamer <- function(df, 
                       vecRename=c(), 
                       ...
                       ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame or tibble
    # vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
    # ...: additional arguments to be passed to rename_with
    
    # names(c()) is NULL; normalize to character(0) so the empty default is an
    # explicit no-op selection rather than passing NULL to .cols
    colsToRename <- names(vecRename)
    if (is.null(colsToRename)) colsToRename <- character(0)
    
    # all_of() makes the external-vector selection explicit (avoids tidyselect
    # ambiguity Notes); unname() strips names from the lookup result so the new
    # column names are plain strings
    dplyr::rename_with(df, 
                       .fn=function(x) unname(vecRename[x]), 
                       .cols=all_of(colsToRename), 
                       ...)
    
}


# Generic function to select columns in a file using an input vector
colSelector <- function(df, 
                        vecSelect=NULL, 
                        ...
                        ) {

    # FUNCTION ARGUMENTS:    
    # df: the data frame or tibble
    # vecSelect: vector for variables to keep c('keep1', "keep2", ...), NULL means keep all
    # ...: additional arguments (not currently used)
    
    # NULL sentinel means "keep every column"
    keepCols <- if (is.null(vecSelect)) names(df) else vecSelect
    
    # all_of() errors on unknown names, surfacing typos in vecSelect early
    select(df, all_of(keepCols))
    
}


# Generic function to mutate columns in a file using an input list
colMutater <- function(df, 
                       selfList=list(), 
                       fullList=list(),
                       ...
                       ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame or tibble
    # selfList: list for functions to apply to self, list('variable'=fn) will apply variable=fn(variable)
    #           processed in order, so more than one function can be applied to self
    # fullList: list for general functions to be applied, list('new variable'=expression(code))
    #           will create 'new variable' as eval(expression(code))
    #           for now, requires passing an expression
    # ...: additional arguments to be passed to across() inside mutate()
    
    # Apply the self-functions sequentially: one mutate() per element, in list
    # order, so a later transformation sees the result of an earlier one
    for (ctr in seq_along(selfList)) 
        df <- mutate(df, across(.cols=names(selfList)[ctr], .fns=selfList[[ctr]], ...))
    
    # Apply the full-mutates sequentially; !!name := injects the new column name
    # NOTE(review): eval() appears to resolve names through mutate()'s data mask
    # (columns of df) before the calling environment -- confirm that expressions
    # passed in fullList do not rely on this function's local variables (df, ctr)
    for (ctr in seq_along(fullList))
        df <- mutate(df, !!names(fullList)[ctr]:=eval(fullList[[ctr]]))
        
    # Return the updated file
    df
    
}


# Generic function to check for uniqueness of rows in a tibble
checkUniqueRows <- function(df, 
                            uniqueBy=NULL, 
                            severity="stop", 
                            noteUnique=TRUE,
                            returnDF=TRUE
                            ) {
    
    # FUNCTION ARGUMENTS
    # df: tibble or data frame
    # uniqueBy: combination of variables for checking uniqueness (NULL means use all)
    # severity: passed to assertive, can be c("stop", "warning", "message", "none")
    # noteUnique: boolean, should a note be generated showing that the uniqueness check passed?
    # returnDF: should the data frame be returned (if TRUE, will alow for downstream chaining)
    
    # Use all variables if uniqueBy is NULL
    if (is.null(uniqueBy)) uniqueBy <- names(df)
    
    # Check for uniqueness on the key columns
    # (select(all_of()) replaces the superseded select_at(vars(...)) idiom)
    df %>%
        select(all_of(uniqueBy)) %>%
        assertive.properties::assert_has_no_duplicates(severity=severity)
    
    # Report back on findings if requested
    if (isTRUE(noteUnique)) cat("\n*** File", "has been checked for uniqueness by:", uniqueBy, "\n\n")
    
    # Return the data frame if requested (invisible NULL otherwise)
    if (returnDF) return(df)
    
}


# Generic function for checking control totals
checkControl <- function(df, 
                         groupBy=c(),
                         useVars=NULL, 
                         fn=sum, 
                         printControls=TRUE,
                         pivotData=!isTRUE(printControls),
                         returnData=!isTRUE(printControls),
                         ...
                         ) {
    
    # FUNCTION ARGUMENTS
    # df: the data frame or tibble
    # groupBy: control totals by level (c() means overall total)
    # useVars: variables to get control totals (NULL means all numeric)
    # fn: function that will be applied to create control totals
    # printControls: boolean, should the control file be printed?
    # pivotData: boolean, should data be pivoted so as to be unique by groupBy with columns name and newValue?
    # returnData: boolean, should the control total data be returned?
    # ...: additional arguments passed on to fn (most common will be na.rm=TRUE)
    
    # Get the columns to summarize (use all numeric non-grouping variables if NULL passed)
    if (is.null(useVars)) useVars <- setdiff(names(df)[sapply(df, is.numeric)], groupBy)
    
    # Get the control totals by group
    # (group_by(across()) replaces the superseded group_by_at(vars()); the explicit
    #  closure forwards ... to fn rather than relying on deprecated dots in across())
    dfControl <- df %>%
        group_by(across(all_of(groupBy))) %>%
        summarize(across(.cols=all_of(useVars), .fns=function(x) fn(x, ...)), .groups="drop")
    
    # Pivot data if requested
    if (pivotData) dfControl <- dfControl %>% pivot_longer(all_of(useVars), values_to="newValue")
    
    # Print control totals if requested
    if (printControls) print(dfControl)
    
    # Return data file if requested
    if (returnData) return(dfControl)
    
}


# Generic function for filtering rows based on criteria
rowFilter <- function(df, 
                      lstFilter=list(), 
                      ...
                      ) {

    # FUNCTION ARGUMENTS
    # df: tibble or data frame
    # lstFilter: a list for filtering records, of form list("field"=c("allowed values"))
    # ...: additional arguments (not currently used)

    # Sequentially restrict rows to the allowed values for each named field
    # (.data[[ ]] looks up the column by its character name inside filter())
    for (fld in names(lstFilter)) {
        df <- filter(df, .data[[fld]] %in% lstFilter[[fld]])
    }

    # Return the filtered data frame
    df

}


# Generic function for combining rows to a single row
combineRows <- function(df, 
                        comboVar, 
                        uqVars=c(),
                        vecCombo=c(), 
                        fn=sum, 
                        ...
                        ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame or tibble
    # comboVar: character string representing variable for being combined
    # uqVars: other variables that the final data should be unique by 
    #         (e.g., date for a state-date file mapping states together)
    # vecCombo: vector of combinations to be made of form c('old value'='new value')
    # fn: function for combining elements (only handles is.numeric for now)
    # ...: other arguments passed to fn
    
    # Add comboVar to uqVars if it is not already included
    if (!(comboVar %in% uqVars)) uqVars <- c(uqVars, comboVar)
    
    # Split data file into a portion that needs modifying and a portion to keep standalone
    # (.data[[ ]] is the safe replacement for get() inside dplyr verbs)
    df <- df %>%
        mutate(dummyVar=ifelse(.data[[comboVar]] %in% names(vecCombo), 1, 0))
    dfKeep <- df %>% filter(dummyVar==0) %>% select(-dummyVar)
    dfMod <- df %>% filter(dummyVar==1) %>% select(-dummyVar)
    
    # Convert elements as appropriate (named-vector lookup maps old values to new)
    dfMod <- dfMod %>%
        mutate(!!comboVar:=vecCombo[.data[[comboVar]]])
    
    # Apply summary function at unique level
    # (group_by(across()) replaces the superseded group_by_at(vars()); the explicit
    #  closure forwards ... to fn rather than relying on deprecated dots in across())
    dfMod <- dfMod %>%
        group_by(across(all_of(uqVars))) %>%
        summarize(across(where(is.numeric), .fns=function(x) fn(x, ...)), .groups="drop")
    
    # Return the modified data frame with the new records at the bottom
    # (all_of() makes the character-vector column selection explicit)
    dfKeep %>%
        select(all_of(names(dfMod))) %>%
        bind_rows(dfMod)
    
}

The code can then be tested on previously downloaded data:

# Test combination of functions to read file, rename fields, mutate variables, and confirm uniqueness
dfRaw_dc_210414 <- fileRead("./RInputFiles/Coronavirus/CDC_dc_downloaded_210414.csv") %>% 
    colRenamer(c('submission_date'='date')) %>% 
    colMutater(selfList=list('date'=lubridate::mdy), 
               fullList=list('naconf'=expression(is.na(conf_cases)))
               ) %>%
    checkUniqueRows(uniqueBy=c("state", "date"))
## 
## -- Column specification --------------------------------------------------------
## cols(
##   submission_date = col_character(),
##   state = col_character(),
##   tot_cases = col_double(),
##   conf_cases = col_double(),
##   prob_cases = col_double(),
##   new_case = col_double(),
##   pnew_case = col_double(),
##   tot_death = col_double(),
##   conf_death = col_double(),
##   prob_death = col_double(),
##   new_death = col_double(),
##   pnew_death = col_double(),
##   created_at = col_character(),
##   consent_cases = col_character(),
##   consent_deaths = col_character()
## )
## 
## *** File has been checked for uniqueness by: state date
dfRaw_dc_210414
## # A tibble: 26,820 x 16
##    date       state tot_cases conf_cases prob_cases new_case pnew_case tot_death
##    <date>     <chr>     <dbl>      <dbl>      <dbl>    <dbl>     <dbl>     <dbl>
##  1 2021-01-01 FL      1300528         NA         NA        0      6063     21673
##  2 2020-04-30 IA         7145         NA         NA      302         0       162
##  3 2020-02-26 UT            0         NA         NA        0        NA         0
##  4 2020-03-05 GA            2         NA         NA       -5        NA         0
##  5 2020-05-04 WV         1224       1224          0       29         0        50
##  6 2020-12-02 NY       346492         NA         NA     5775         0     10117
##  7 2020-05-05 GA        28196      28182         14      602         1      1258
##  8 2020-08-23 TN       143937     141000       2937     1854        38      1567
##  9 2020-05-16 AS            0         NA         NA        0         0         0
## 10 2020-09-08 PW            0         NA         NA        0         0         0
## # ... with 26,810 more rows, and 8 more variables: conf_death <dbl>,
## #   prob_death <dbl>, new_death <dbl>, pnew_death <dbl>, created_at <chr>,
## #   consent_cases <chr>, consent_deaths <chr>, naconf <lgl>
# Testing the checkControl functions
checkControl(dfRaw_dc_210414, printControls=TRUE, na.rm=TRUE)
## # A tibble: 1 x 10
##   tot_cases conf_cases prob_cases new_case pnew_case tot_death conf_death
##       <dbl>      <dbl>      <dbl>    <dbl>     <dbl>     <dbl>      <dbl>
## 1    4.50e9 2067083706  230283007 30995592   3328225  96207328   51520967
## # ... with 3 more variables: prob_death <dbl>, new_death <dbl>,
## #   pnew_death <dbl>
checkControl(dfRaw_dc_210414, groupBy="state", useVars=c("new_case", "new_death"), printControls=FALSE)
## # A tibble: 120 x 3
##    state name      newValue
##    <chr> <chr>        <dbl>
##  1 AK    new_case     62547
##  2 AK    new_death      310
##  3 AL    new_case    512950
##  4 AL    new_death    10712
##  5 AR    new_case    332222
##  6 AR    new_death     5665
##  7 AS    new_case         0
##  8 AS    new_death        0
##  9 AZ    new_case    849611
## 10 AZ    new_death    17086
## # ... with 110 more rows

A function is written to create line plots for displaying control totals:

helperLinePlot <- function(df, 
                           x, 
                           y, 
                           groupColor=NULL,
                           facetVar=NULL, 
                           facetRows=NULL, 
                           facetScales="fixed", 
                           xLab="",
                           yLab="Summed Value", 
                           titleLab=paste0("Control totals by ", x)
                           ) {

    # FUNCTION ARGUMENTS
    # df: the data frame or tibble
    # x: the x variable (character name of a column in df)
    # y: the y variable (character name of a column in df)
    # groupColor: a variable to be used for grouping and coloring (NULL means neither)
    # facetVar: the faceting variable, NULL means do not facet
    # facetRows: the number of rows to include in faceting (NULL means allow facet_wrap to select)
    # facetScales: the scale for the faceting, can be "fixed" or "free" or "free_y" or "free_x"
    # xLab: label for the x-axis
    # yLab: label for the y-axis
    # titleLab: label for the title
    
    # Code to create the main plot
    # (the .data pronoun replaces the deprecated aes_string() for character column names;
    #  passing NULL to geom_line() simply inherits the plot-level mapping)
    p1 <- df %>% 
        ggplot(aes(x=.data[[x]], y=.data[[y]])) + 
        geom_line(if (!is.null(groupColor)) aes(group=.data[[groupColor]], color=.data[[groupColor]])) + 
        labs(title=titleLab, x=xLab, y=yLab)
    # Add faceting if not NULL (vars(.data[[ ]]) replaces the get()-based formula)
    if (!is.null(facetVar)) p1 <- p1 + facet_wrap(vars(.data[[facetVar]]), nrow=facetRows, scales=facetScales)
    print(p1)
    
}

dfRaw_dc_210414 %>%
    checkControl(groupBy="date", useVars=c("new_case", "new_death"), printControls=FALSE) %>%
    helperLinePlot(x="date", y="newValue", facetVar="name", facetScales="free_y")

dfRaw_dc_210414 %>%
    checkControl(groupBy="date", useVars=c("new_case", "new_death"), printControls=FALSE) %>%
    helperLinePlot(x="date", y="newValue", facetVar="name", facetScales="free_y", groupColor="name")

Additionally, it will be valuable to have functions for:

  • Difference in variable names in new or old file
  • Difference in field values (e.g., date or state) in new or old file
  • Difference in specified column sums to specified control levels over specified time periods

The function helperSimilarity allows for checking consistency of two sets:

# Helper function to check for similarity of key elements
helperSimilarity <- function(newData, 
                             refData, 
                             label, 
                             countOnly=FALSE, 
                             logFile=NULL, 
                             logAppend=TRUE, 
                             returnData=isTRUE(countOnly)
                             ) {
    
    # FUNCTION ARGUMENTS:
    # newData: a new data file
    # refData: a reference data file
    # label: a label for the check being run
    # countOnly: boolean, should only a count of differences be reported in the main log file?
    # logFile: external file for writing out detailed differences (NULL means do not write to external file)
    # logAppend: boolean, if the external log file exists should it be appended? (FALSE means overwrite)
    # returnData: should the differences data be returned (default is yes for countOnly, no otherwise)

    # Find elements only in the reference (d1) and only in the new data (d2)
    d1 <- setdiff(refData, newData)
    d2 <- setdiff(newData, refData)
    
    # Write the differences (counts or actual values) to the main log file
    cat("\n\nChecking for similarity of:", label)
    cat("\nIn reference but not in current:", if(countOnly) length(d1) else d1)
    cat("\nIn current but not in reference:", if(countOnly) length(d2) else d2)
    
    # If a logFile is provided and only counts are in the main file, write to the log file
    # (&& is the correct scalar short-circuit operator here; & is elementwise)
    if (countOnly && !is.null(logFile)) {
        cat("\nDetailed differences available in:", logFile)
        capture.output(cat("\nDetailed differences for: ", label, "\n", sep=""), 
                       cat("\nIn reference but not in current:\n", paste(d1, collapse="\n"), sep=""), 
                       cat("\nIn current but not in reference:\n", paste(d2, collapse="\n"), sep=""), 
                       file=logFile, 
                       append=logAppend
                       )
    }

    # Return data if requested
    if (returnData) return(list(d1=d1, d2=d2))
    
}

A function is written to check variable names and values from any number of discrete columns:

checkSimilarity <- function(df, 
                            ref, 
                            keyVars=list(), 
                            writeLog=NULL, 
                            ovrwriteLog=TRUE
                            ) {
    
    # FUNCTION ARGUMENTS:
    # df: the new data frame
    # ref: the reference data frame
    # keyVars: the key variables to be tested, passed as a named list
    #          list('varName'=list(label='label', countOnly=boolean, externalLog=boolean, convChar=boolean))
    # writeLog: an external log file to be written to (NULL means main log file only)
    # ovrwriteLog: boolean, should the external log be overwritten (once, at the start of the run)?
    
    # Check for consistency of variable names (always output to main log file)
    helperSimilarity(newData=names(df), refData=names(ref), label="column names")
    
    # Create storage container for similarities
    changeList <- vector("list", length(keyVars))
    names(changeList) <- names(keyVars)
    
    # Check for consistency of variables in keyVars
    for (ctr in seq_along(keyVars)) {
        vrbl <- names(keyVars)[ctr]
        d1 <- df %>% pull(vrbl) %>% unique() %>% sort()
        d2 <- ref %>% pull(vrbl) %>% unique() %>% sort()
        lstData <- keyVars[[ctr]]
        # Only the first variable may overwrite the external log; later variables always
        # append so one variable's detailed output does not clobber the previous one's
        changeList[[ctr]] <- helperSimilarity(if(isTRUE(lstData$convChar)) as.character(d1) else d1, 
                                              if(isTRUE(lstData$convChar)) as.character(d2) else d2, 
                                              label=lstData$label, 
                                              countOnly=lstData$countOnly, 
                                              logFile=if(isFALSE(lstData$externalLog)) NULL else writeLog, 
                                              logAppend=if (ctr == 1) !ovrwriteLog else TRUE, 
                                              returnData=TRUE
                                              )
    }

    # Return the list file of differences
    cat("\n\n")
    changeList
    
}

The function is applied to dfRaw_dc_210414 against itself:

checkSimilarity(dfRaw_dc_210414, 
                dfRaw_dc_210414, 
                keyVars=list(date=list(label='date', countOnly=TRUE), 
                             state=list(label='state', countOnly=FALSE)
                             )
                )
## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 0
## 
## Checking for similarity of: state
## In reference but not in current: 
## In current but not in reference:
## $date
## $date$d1
## numeric(0)
## 
## $date$d2
## numeric(0)
## 
## 
## $state
## $state$d1
## character(0)
## 
## $state$d2
## character(0)

New data are downloaded, allowing for QC to be more meaningful. The download process is cached to avoid multiple hits against the server:

fileDownload(fileName="./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv", 
             url="https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD", 
             ovrWrite=FALSE
             )
##                                                           size isdir mode
## ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv 2276460 FALSE  666
##                                                                      mtime
## ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv 2021-05-02 09:47:39
##                                                                      ctime
## ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv 2021-05-02 09:47:37
##                                                                      atime exe
## ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv 2021-05-02 09:47:39  no

The read and initial QC functions are then run:

# Test combination of functions to read file, rename fields, mutate variables, and confirm uniqueness
dfRaw_dc_210502 <- fileRead("./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv") %>% 
    colRenamer(c('submission_date'='date')) %>% 
    colMutater(selfList=list('date'=lubridate::mdy)) %>%
    checkUniqueRows(uniqueBy=c("state", "date"))
## 
## -- Column specification --------------------------------------------------------
## cols(
##   submission_date = col_character(),
##   state = col_character(),
##   tot_cases = col_double(),
##   conf_cases = col_double(),
##   prob_cases = col_double(),
##   new_case = col_double(),
##   pnew_case = col_double(),
##   tot_death = col_double(),
##   conf_death = col_double(),
##   prob_death = col_double(),
##   new_death = col_double(),
##   pnew_death = col_double(),
##   created_at = col_character(),
##   consent_cases = col_character(),
##   consent_deaths = col_character()
## )
## 
## *** File has been checked for uniqueness by: state date
dfRaw_dc_210502
## # A tibble: 27,900 x 15
##    date       state tot_cases conf_cases prob_cases new_case pnew_case tot_death
##    <date>     <chr>     <dbl>      <dbl>      <dbl>    <dbl>     <dbl>     <dbl>
##  1 2021-04-01 RI       138406         NA         NA      428         0      2627
##  2 2020-10-15 DC        16166         NA         NA       34         0       638
##  3 2021-03-16 MN       498926         NA         NA      708       152      6817
##  4 2021-04-16 CT       329062     302775      26287     1062       208      7995
##  5 2020-02-14 DC            0         NA         NA        0        NA         0
##  6 2020-03-12 NJ           29         NA         NA        6        NA         1
##  7 2020-06-25 NE        18346         NA         NA      125         0       260
##  8 2020-02-24 CA           10         NA         NA        0        NA         0
##  9 2020-11-14 VA       201961     183455      18506     1161       191      3800
## 10 2020-11-17 DE        29755      28473       1282      203        11       742
## # ... with 27,890 more rows, and 7 more variables: conf_death <dbl>,
## #   prob_death <dbl>, new_death <dbl>, pnew_death <dbl>, created_at <chr>,
## #   consent_cases <chr>, consent_deaths <chr>
# Basic line plot for new cases and new deaths by date
dfRaw_dc_210502 %>%
    checkControl(groupBy="date", useVars=c("new_case", "new_death"), printControls=FALSE) %>%
    helperLinePlot(x="date", y="newValue", facetVar="name", facetScales="free_y", groupColor="name")

# Confirm same universe of variable names, states, and dates
diff_dc_210502_20414 <- checkSimilarity(df=dfRaw_dc_210502, 
                                        ref=dfRaw_dc_210414, 
                                        keyVars=list(date=list(label='date', countOnly=TRUE, convChar=TRUE), 
                                                     state=list(label='state', countOnly=FALSE)
                                                     )
                                        )
## 
## 
## Checking for similarity of: column names
## In reference but not in current: naconf
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 18
## 
## Checking for similarity of: state
## In reference but not in current: 
## In current but not in reference:
diff_dc_210502_20414
## $date
## $date$d1
## character(0)
## 
## $date$d2
##  [1] "2021-04-13" "2021-04-14" "2021-04-15" "2021-04-16" "2021-04-17"
##  [6] "2021-04-18" "2021-04-19" "2021-04-20" "2021-04-21" "2021-04-22"
## [11] "2021-04-23" "2021-04-24" "2021-04-25" "2021-04-26" "2021-04-27"
## [16] "2021-04-28" "2021-04-29" "2021-04-30"
## 
## 
## $state
## $state$d1
## character(0)
## 
## $state$d2
## character(0)

The functions appear to be chaining and working correctly. Next steps are to plot outputs of checkSimilarity, and to add capability to show differences in actual/cumulative values by user-specified grouping variables.

A function is written to take a differences list and plot:

plotSimilarity <- function(lst, 
                           plotItems=NULL, 
                           nameMap=c("d1"="old file only", "d2"="new file only")
                           ) {
    
    # FUNCTION ARGUMENTS:
    # lst: a named list that includes setdiff() results in two directions
    # plotItems: character vector of the named list items to process (NULL means all)
    # nameMap: map from the list sub-component names to a descriptive name for plotting
    
    # If plotItems is NULL, plot everything 
    if (is.null(plotItems)) plotItems <- names(lst)
    
    # Loop through and plot
    for (vrbl in plotItems) {
        # Stack the two difference vectors into one long tibble, tagged by source
        plotData <- lapply(lst[[vrbl]], FUN=function(x) tibble::tibble(value=x)) %>%
            bind_rows(.id="src") %>%
            mutate(src=factor(unname(nameMap[src]), levels=unname(nameMap))) %>%
            arrange(value, src)
        # Dot plot of the mismatched values, colored by which file they came from
        p1 <- ggplot(plotData, aes(x=value, y=src)) + 
            geom_point(aes(color=src), size=4) + 
            labs(x=vrbl, y="Mismatch cause", title=paste0("Values for ", vrbl, " only in one file")) + 
            coord_flip() + 
            scale_color_discrete("Mismatch\ncause")
        print(p1)
    }
    
}

The function is then run for date:

plotSimilarity(diff_dc_210502_20414, plotItems=c("date"))

As expected, the new file contains a greater universe of dates and the same universe of states.

A further capability is added to compare data aggregated to a specific level across files:

compareAggregate <- function(df, 
                             ref, 
                             grpVar, 
                             numVars, 
                             sameUniverse=NA,
                             plotData=FALSE, 
                             isLine=TRUE, 
                             returnDelta=FALSE
                             ) {
    
    # FUNCTION ARGUMENTS:
    # df: the latest data frame or tibble
    # ref: the reference data frame or tibble
    # grpVar: character vector of the level for aggregation
    # numVars: character vector of the numeric variables to explore
    # sameUniverse: character vector of variables where the files should be required to have the same universe
    #               NA means this is not enforced, helps limit to same date range when checking state
    # plotData: boolean, should the data be plotted?
    # isLine: boolean, should the plot be drawn as a line graph (FALSE means point graph)
    # returnDelta: boolean, should the delta aggregate file be returned?
    
    # Determine whether universe matching was requested
    # (all(is.na()) is vector-safe; a bare if (!is.na(x)) errors when length(x) > 1)
    useUniverse <- !all(is.na(sameUniverse))
    
    # Get the matching universe if requested
    if (useUniverse) {
        # Keep only sameUniverse combinations present in BOTH files
        # (count(across()) generalizes the previously hard-coded count(date, src))
        univData <- df %>% select(all_of(sameUniverse)) %>%
            bind_rows(select(ref, all_of(sameUniverse)), .id="src") %>%
            count(across(all_of(sameUniverse)), src) %>%
            pivot_wider(names_from="src", values_from="n") %>%
            filter(complete.cases(.)) %>%
            select(all_of(sameUniverse))
        # Filter df and ref such that they only include matches
        df <- univData %>% left_join(df, by=names(univData))
        ref <- univData %>% left_join(ref, by=names(univData))
    }
    
    # Create a tibble of the aggregated data from df and ref
    dfAgg <- df %>%
        checkControl(groupBy=grpVar, useVars=numVars, printControls=FALSE, na.rm=TRUE)
    refAgg <- ref %>%
        checkControl(groupBy=grpVar, useVars=numVars, printControls=FALSE, na.rm=TRUE) %>%
        colRenamer(vecRename=c("newValue"="refValue"))
    deltaAgg <- full_join(dfAgg, refAgg, by=c(grpVar, "name")) %>%
        tibble::as_tibble()
    
    # Create a plot if requested
    if (plotData) {
        p1 <- deltaAgg %>%
            pivot_longer(c(newValue, refValue), names_to="src") %>%
            filter(!is.na(value)) %>%
            ggplot() + 
            labs(x="", y="", title=paste0("Aggregated data by ", grpVar, " across new and reference file")) + 
            scale_color_discrete("Source")
        if (isTRUE(isLine)) {
            # Line plot (the .data pronoun replaces the deprecated aes_string())
            p1 <- p1 + 
                geom_line(aes(x=.data[[grpVar]], y=value, group=src, color=src)) + 
                facet_wrap(~name, scales="free_y")
        } else {
            # Point plot ordered by value (the .data pronoun replaces get())
            p1 <- p1 + 
                geom_point(aes(x=fct_reorder(.data[[grpVar]], value), y=value, color=src)) + 
                coord_flip() +
                facet_wrap(~name, scales="free_x", nrow=1)
        }
        if (useUniverse) {
            # Collapse so a multi-variable sameUniverse renders as one subtitle string
            p1 <- p1 + labs(subtitle=paste0("Data filtered to same universe on: ", 
                                            paste(sameUniverse, collapse=", ")))
        }
        print(p1)
    }
    
    # Return the aggregate data if requested
    if(isTRUE(returnDelta)) return(deltaAgg)
    
}

The function is then run for totals by date and state (superseded below to include printouts):

# Compare data by date
compareAggregate(df=dfRaw_dc_210502, 
                 ref=dfRaw_dc_210414, 
                 grpVar="date", 
                 numVars=c("new_case", "new_death", "tot_cases", "tot_death"), 
                 plotData=TRUE
                 )

# Compare data by state, using total universe
compareAggregate(df=dfRaw_dc_210502, 
                 ref=dfRaw_dc_210414, 
                 grpVar="state", 
                 numVars=c("new_case", "new_death"), 
                 plotData=TRUE, 
                 isLine=FALSE
                 )

# Compare data by state, using only matching values
compareAggregate(df=dfRaw_dc_210502, 
                 ref=dfRaw_dc_210414, 
                 grpVar="state", 
                 sameUniverse="date",
                 numVars=c("new_case", "new_death", "tot_cases", "tot_death"), 
                 plotData=TRUE, 
                 isLine=FALSE
                 )

A function is written to flag instances where two columns differ by more than a specified tolerance:

flagLargeDelta <- function(df, 
                           col1="newValue", 
                           col2="refValue", 
                           pctTol=0.01, 
                           absTol=5, 
                           sortBy=c("pctDelta", "absDelta"),
                           dropNA=TRUE,
                           printAll=FALSE
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame or tibble
    # col1: the name of the first column being compared
    # col2: the name of the second column being compared
    # pctTol: values will be flagged as 'high percent' if abs(col1-col2) > pctTol*mean(col1, col2)
    #         setting pctTol to 0 means only absTol is relevant
    # absTol: values will be flagged as 'high volume' if abs(col1-col2) > absTol
    #         setting absTol to 0 means only pctTol is relevant
    #         if absTol==0 and pctTol==0 then any difference in col1 and col2 will be flagged
    # sortBy: final output should be sorted by descending of these variables (NULL means no sorting)
    # dropNA: boolean, should cases where col1 or col2 is NA be excluded from reporting?
    # printAll: print every record (as.data.frame) rather than just the top 10 (tibble)
    
    # Standardize the two comparison columns to val1/val2
    tol <- colRenamer(df, vecRename=purrr::set_names(c("val1", "val2"), c(col1, col2)))
    
    # Optionally exclude rows where either comparison value is missing
    if (isTRUE(dropNA)) tol <- filter(tol, !is.na(val1), !is.na(val2))
    
    # Treat any remaining NA as zero, compute the absolute and symmetric-percent deltas,
    # keep only rows breaching both tolerances, then restore the original column names
    tol <- tol %>%
        mutate(v1=ifelse(is.na(val1), 0, val1), 
               v2=ifelse(is.na(val2), 0, val2), 
               absDelta=abs(v1-v2), 
               pctDelta=2*absDelta/(v1+v2)
               ) %>%
        select(-v1, -v2) %>%
        filter(absDelta > absTol, pctDelta > pctTol) %>%
        colRenamer(vecRename=purrr::set_names(c(col1, col2), c("val1", "val2")))
    
    # Sort descending by the requested columns if any were given
    if (!is.null(sortBy)) {
        tol <- arrange(tol, across(.cols=all_of(sortBy), .fns=desc))
    }
    
    # A plain data frame prints every row, a tibble prints only the top 10
    if (isTRUE(printAll)) tol <- as.data.frame(tol)
    
    # Describe the upcoming report
    cat("\n\n***Differences of at least ", absTol, " and at least ", round(100*pctTol, 3), "%\n\n", sep="")
    
    # Return the data
    tol
    
}

The compareAggregate and flagLargeDelta functions are chained to allow for both plots and printouts:

# Compare data by date
compareAggregate(df=dfRaw_dc_210502, 
                 ref=dfRaw_dc_210414, 
                 grpVar="date", 
                 numVars=c("new_case", "new_death", "tot_cases", "tot_death"), 
                 plotData=TRUE, 
                 returnDelta=TRUE
                 ) %>%
    flagLargeDelta(pctTol=0.05, printAll=TRUE, sortBy=c("name", "pctDelta", "absDelta"))

## 
## 
## ***Differences of at least 5 and at least 5%
##          date      name newValue refValue absDelta   pctDelta
## 1  2020-01-23 tot_cases       47        1       46 1.91666667
## 2  2020-01-22 tot_cases       46        1       45 1.91489362
## 3  2020-01-25 tot_cases       51        2       49 1.84905660
## 4  2020-01-24 tot_cases       48        2       46 1.84000000
## 5  2020-01-30 tot_cases       59        5       54 1.68750000
## 6  2020-01-29 tot_cases       57        5       52 1.67741935
## 7  2020-01-28 tot_cases       56        5       51 1.67213115
## 8  2020-01-27 tot_cases       55        5       50 1.66666667
## 9  2020-01-26 tot_cases       54        5       49 1.66101695
## 10 2020-01-31 tot_cases       63        7       56 1.60000000
## 11 2020-02-02 tot_cases       67        8       59 1.57333333
## 12 2020-02-01 tot_cases       66        8       58 1.56756757
## 13 2020-02-05 tot_cases       76       11       65 1.49425287
## 14 2020-02-06 tot_cases       76       11       65 1.49425287
## 15 2020-02-04 tot_cases       75       11       64 1.48837209
## 16 2020-02-10 tot_cases       81       12       69 1.48387097
## 17 2020-02-03 tot_cases       74       11       63 1.48235294
## 18 2020-02-09 tot_cases       80       12       68 1.47826087
## 19 2020-02-15 tot_cases       91       14       77 1.46666667
## 20 2020-02-16 tot_cases       91       14       77 1.46666667
## 21 2020-02-08 tot_cases       78       12       66 1.46666667
## 22 2020-02-12 tot_cases       84       13       71 1.46391753
## 23 2020-02-07 tot_cases       77       12       65 1.46067416
## 24 2020-02-11 tot_cases       83       13       70 1.45833333
## 25 2020-02-14 tot_cases       89       14       75 1.45631068
## 26 2020-02-13 tot_cases       86       14       72 1.44000000
## 27 2020-02-17 tot_cases       97       16       81 1.43362832
## 28 2020-02-18 tot_cases       97       16       81 1.43362832
## 29 2020-02-19 tot_cases       98       17       81 1.40869565
## 30 2020-02-20 tot_cases      101       18       83 1.39495798
## 31 2020-02-21 tot_cases      111       23       88 1.31343284
## 32 2020-02-24 tot_cases      118       25       93 1.30069930
## 33 2020-02-23 tot_cases      114       25       89 1.28057554
## 34 2020-02-22 tot_cases      113       25       88 1.27536232
## 35 2020-02-25 tot_cases      123       29       94 1.23684211
## 36 2020-02-26 tot_cases      127       31       96 1.21518987
## 37 2020-02-27 tot_cases      138       35      103 1.19075145
## 38 2020-02-28 tot_cases      146       39      107 1.15675676
## 39 2020-02-29 tot_cases      165       54      111 1.01369863
## 40 2020-03-01 tot_cases      189       63      126 1.00000000
## 41 2020-03-02 tot_cases      230       95      135 0.83076923
## 42 2020-03-03 tot_cases      268      123      145 0.74168798
## 43 2020-03-04 tot_cases      306      152      154 0.67248908
## 44 2020-03-05 tot_cases      385      222      163 0.53706755
## 45 2020-03-06 tot_cases      451      279      172 0.47123288
## 46 2020-03-07 tot_cases      542      357      185 0.41156841
## 47 2020-03-08 tot_cases      722      521      201 0.32341110
## 48 2020-03-09 tot_cases      988      759      229 0.26216371
## 49 2020-03-10 tot_cases     1337     1084      253 0.20900454
## 50 2020-03-11 tot_cases     1682     1398      284 0.18441558
## 51 2020-03-14 tot_cases     3086     2606      480 0.16865777
## 52 2020-03-13 tot_cases     2586     2194      392 0.16401674
## 53 2020-03-12 tot_cases     2180     1857      323 0.16001982
## 54 2020-03-16 tot_cases     5372     4705      667 0.13238067
## 55 2020-03-15 tot_cases     4406     3872      534 0.12901667
## 56 2020-03-17 tot_cases     8424     7603      821 0.10245211
## 57 2020-03-18 tot_cases    12100    11098     1002 0.08638676
## 58 2020-03-19 tot_cases    17089    15934     1155 0.06995125
## 59 2020-03-20 tot_cases    20729    19375     1354 0.06752444
## 60 2020-03-21 tot_cases    26741    25233     1508 0.05802901
## 61 2020-09-07 new_death      268      232       36 0.14400000
## 62 2021-02-10 new_death     3193     3455      262 0.07882070
## 63 2021-02-03 new_death     3330     3583      253 0.07319543
## 64 2020-10-05 new_death      473      445       28 0.06100218
## 65 2021-01-27 new_death     3706     3922      216 0.05663346
## 66 2020-07-19 new_death      745      704       41 0.05659075
## 67 2020-09-13 new_death      554      524       30 0.05565863
## 68 2020-11-11 new_death     1013     1068       55 0.05285920
## 69 2021-02-02 new_death     3180     3351      171 0.05236564
## 70 2021-04-02 new_death      771      812       41 0.05180038
## 71 2020-10-04 new_death      438      416       22 0.05152225
## 72 2020-01-22  new_case       46        1       45 1.91489362
## 73 2020-02-27  new_case       11        4        7 0.93333333
## 74 2020-03-01  new_case       24        9       15 0.90909091
## 75 2020-03-03  new_case       38       28       10 0.30303030
## 76 2020-03-04  new_case       38       29        9 0.26865672
## 77 2020-03-02  new_case       41       32        9 0.24657534
## 78 2020-03-14  new_case      500      412       88 0.19298246
## 79 2020-03-13  new_case      406      337       69 0.18573351
## 80 2020-03-07  new_case       91       78       13 0.15384615
## 81 2020-03-16  new_case      966      833      133 0.14785992
## 82 2020-03-06  new_case       66       57        9 0.14634146
## 83 2020-03-05  new_case       79       70        9 0.12080537
## 84 2020-03-09  new_case      266      238       28 0.11111111
## 85 2020-09-08  new_case    26186    23701     2485 0.09962515
## 86 2021-04-11  new_case    54381    49409     4972 0.09580884
## 87 2020-03-11  new_case      345      314       31 0.09408194
## 88 2020-03-08  new_case      180      164       16 0.09302326
## 89 2020-03-12  new_case      498      459       39 0.08150470
## 90 2021-04-12  new_case    57236    61526     4290 0.07224533
## 91 2020-03-10  new_case      349      325       24 0.07121662
## 92 2020-07-06  new_case    50177    46972     3205 0.06598112
## 93 2020-03-20  new_case     3640     3441      199 0.05620675
## 94 2020-03-17  new_case     3052     2898      154 0.05176471
## 95 2020-08-24  new_case    35040    33287     1753 0.05131207
## 96 2020-09-12  new_case    39698    41758     2060 0.05057945
## 97 2020-03-18  new_case     3676     3495      181 0.05048110
# Compare data by state, using total universe (plots only; no delta table returned)
compareAggregate(df=dfRaw_dc_210502, ref=dfRaw_dc_210414,
                 grpVar="state", numVars=c("new_case", "new_death"),
                 plotData=TRUE, isLine=FALSE, returnDelta=FALSE)

# Compare data by state, using only matching values (dates present in both files),
# then flag any state whose aggregate moved by at least 0.1%
dfRaw_dc_210502 %>%
    compareAggregate(ref=dfRaw_dc_210414,
                     grpVar="state",
                     sameUniverse="date",
                     numVars=c("new_case", "new_death", "tot_cases", "tot_death"),
                     plotData=TRUE,
                     isLine=FALSE,
                     returnDelta=TRUE) %>%
    flagLargeDelta(pctTol=0.001, absTol=0, printAll=TRUE, sortBy=c("name", "pctDelta", "absDelta"))

## 
## 
## ***Differences of at least 0 and at least 0.1%
##    state      name newValue refValue absDelta    pctDelta
## 1     AL tot_death  1784530  1415458   369072 0.230670865
## 2     AK tot_death    52934    42347    10587 0.222226887
## 3     SC tot_death  1482611  1479011     3600 0.002431100
## 4     AL tot_cases 84345774 80518977  3826797 0.046423471
## 5     MO tot_cases 88597603 87112443  1485160 0.016904668
## 6     AK tot_cases  8345553  8302618    42935 0.005157924
## 7     AK new_death      341      310       31 0.095238095
## 8     AL new_death    10832    10712      120 0.011139993
## 9     SC new_death     9370     9279       91 0.009759236
## 10    RI new_death     2644     2638        6 0.002271867
## 11    AL  new_case   522889   512950     9939 0.019190241
## 12    MO  new_case   574786   584874    10088 0.017398203
## 13    RI  new_case   142859   141097     1762 0.012410373
## 14    SC  new_case   564337   563304     1033 0.001832143

The data sources appear to be well aligned. There are some meaningful percentage differences by date, particularly for older data, but aggregates by state over the time period available in each dataset are largely very close to each other. The largest restatements were for AL and AK; the AL differences were also evident on the plot.

The functions can then be combined for a full run of data processing:

# Mapping for urlType to url: download endpoints for the two CDC extracts
urlMapper <- setNames(c("https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD",
                        "https://beta.healthdata.gov/api/views/g62h-syeh/rows.csv?accessType=DOWNLOAD"),
                      c("cdcDaily", "cdcHosp"))

# Mapping for urlType to colRenamer(vecRename=...): c('raw column'='standard name')
# "default" is c() (i.e. NULL), meaning no renames for unrecognized urlTypes
renMapper <- list(
    cdcDaily = c(submission_date = "date",
                 new_case        = "new_cases",
                 tot_death       = "tot_deaths",
                 new_death       = "new_deaths"),
    cdcHosp = c(inpatient_beds_used_covid = "inp",
                total_adult_patients_hospitalized_confirmed_and_suspected_covid = "hosp_adult",
                total_pediatric_patients_hospitalized_confirmed_and_suspected_covid = "hosp_ped"),
    default = c()
)

# Mapping for urlType to colMutater(selfList=...): per-column self-transformations
# cdcDaily: parse the submission-date character column into Date via lubridate::mdy
selfListMapper <- list(cdcDaily = list(date = lubridate::mdy),
                       cdcHosp  = list(),
                       default  = list())

# Mapping for urlType to colMutater(fullList=...): general derived-column expressions
# (currently no derived columns are needed for either source)
fullListMapper <- list(cdcDaily = list(),
                       cdcHosp  = list(),
                       default  = list())

# Mapping for urlType to checkUniqueRows(uniqueBy=...): both sources are one row
# per state per date
uqMapper <- setNames(rep(list(c("state", "date")), 2), c("cdcDaily", "cdcHosp"))

# Mapping for urlType to checkControls(...): grouping variable and the value
# columns to total when plotting basic control totals
checkControlGroupMapper <- list(cdcDaily = "date",
                                cdcHosp  = "date",
                                default  = c())
checkControlVarsMapper <- list(cdcDaily = c("new_cases", "new_deaths"),
                               cdcHosp  = c("inp", "hosp_adult", "hosp_ped"))

# Mapping for urlType to checkSimilarity(..., keyVars=): both sources share the
# same key specification (dates compared by count only; states listed in full)
checkSimilarityMapper <- local({
    keySpec <- list(date  = list(label = "date", countOnly = TRUE, convChar = TRUE),
                    state = list(label = "state", countOnly = FALSE))
    list(cdcDaily = keySpec, cdcHosp = keySpec, default = list())
})

# Mapping for urlType to plotSimilarity(...): items to plot for each source
# ("default" is c(), i.e. nothing to plot)
plotSimilarityMapper <- list(cdcDaily = "date",
                             cdcHosp  = "date",
                             default  = c())

# Mapping file for aggregates
# Drives step 6 of readQCRawFile(): for each urlType, each sub-entry ("l1", "l2", "l3")
# supplies one set of compareAggregate() arguments; when flagLargeDelta=TRUE, the
# pctTol/absTol/sortBy/dropNA/printAll entries are forwarded to flagLargeDelta().
# - l1: compare aggregates by date (line plot); flag deltas of at least absTol and pctTol
# - l2: compare new-value aggregates by state, plot only (no delta report)
# - l3: compare aggregates by state on the matching date universe; flag deltas >= 0.1%
keyAggMapper <- list("cdcDaily"=list("l1"=list("grpVar"="date",
                                               "numVars"=c("new_cases", "new_deaths",
                                                           "tot_cases", "tot_deaths"
                                                           ),
                                               "sameUniverse"=NA,
                                               "plotData"=TRUE,
                                               "isLine"=TRUE,
                                               "returnDelta"=TRUE,
                                               "flagLargeDelta"=TRUE,
                                               "pctTol"=0.05,
                                               "absTol"=5,
                                               "sortBy"=c("name", "pctDelta", "absDelta"),
                                               "dropNA"=TRUE,
                                               "printAll"=TRUE
                                               ),
                                     "l2"=list("grpVar"="state",
                                               "numVars"=c("new_cases", "new_deaths"),
                                               "sameUniverse"=NA,
                                               "plotData"=TRUE,
                                               "isLine"=FALSE,
                                               "returnDelta"=FALSE,
                                               "flagLargeDelta"=FALSE
                                               ),
                                     "l3"=list("grpVar"="state",
                                               "numVars"=c("new_cases", "new_deaths",
                                                           "tot_cases", "tot_deaths"
                                                           ),
                                               "sameUniverse"="date",
                                               "plotData"=TRUE,
                                               "isLine"=FALSE,
                                               "returnDelta"=TRUE,
                                               "flagLargeDelta"=TRUE,
                                               "pctTol"=0.001,
                                               "absTol"=0,
                                               "sortBy"=c("name", "pctDelta", "absDelta"),
                                               "dropNA"=TRUE,
                                               "printAll"=TRUE
                                               )
                                     ),
                     "cdcHosp"=list("l1"=list("grpVar"="date",
                                              "numVars"=c("inp", "hosp_adult", "hosp_ped"),
                                              "sameUniverse"=NA,
                                              "plotData"=TRUE,
                                              "isLine"=TRUE,
                                              "returnDelta"=TRUE,
                                              "flagLargeDelta"=TRUE,
                                              "pctTol"=0.05,
                                              "absTol"=5,
                                              "sortBy"=c("name", "pctDelta", "absDelta"),
                                              "dropNA"=TRUE,
                                              "printAll"=TRUE
                                              ),
                                    "l2"=list("grpVar"="state",
                                              "numVars"=c("inp", "hosp_adult", "hosp_ped"),
                                              "sameUniverse"=NA,
                                              "plotData"=TRUE,
                                              "isLine"=FALSE,
                                              "returnDelta"=FALSE,
                                              "flagLargeDelta"=FALSE
                                              ),
                                    "l3"=list("grpVar"="state",
                                              "numVars"=c("inp", "hosp_adult", "hosp_ped"),
                                              "sameUniverse"="date",
                                              "plotData"=TRUE,
                                              "isLine"=FALSE,
                                              "returnDelta"=TRUE,
                                              "flagLargeDelta"=TRUE,
                                              "pctTol"=0.001,
                                              "absTol"=0,
                                              "sortBy"=c("name", "pctDelta", "absDelta"),
                                              "dropNA"=TRUE,
                                              "printAll"=TRUE
                                              )
                                    )
                     )

# Mapping list for vector selection in processed data: both sources keep the
# date/state keys plus their source-specific metrics
vecSelectMapper <- local({
    idVars <- c("date", "state")
    list(cdcDaily = c(idVars, "tot_cases", "tot_deaths", "new_cases", "new_deaths"),
         cdcHosp  = c(idVars, "inp", "hosp_adult", "hosp_ped"))
})

# Mapping list for lstFilter: restrict both sources to the 50 states plus DC
lstFilterMapper <- local({
    usStates <- list(state = c(state.abb, "DC"))
    list(cdcDaily = usStates, cdcHosp = usStates)
})

# Mapping list for combining data elements: the daily file reports NYC separately,
# so fold NYC into NY (summing with specNASum, unique by date); no combos for hosp
lstComboMapper <- list(
    cdcDaily = list(
        nyc = list(comboVar = "state",
                   uqVars   = "date",
                   vecCombo = c(NY = "NY", NYC = "NY"),
                   fn       = specNASum)
    ),
    cdcHosp = list()
)

# Mapping file for creating per-capita metrics: c('count column'='per-million name')
perCapMapper <- c(tot_cases  = "tcpm",
                  tot_deaths = "tdpm",
                  new_cases  = "cpm",
                  new_deaths = "dpm",
                  inp        = "hpm",
                  hosp_adult = "ahpm",
                  hosp_ped   = "phpm")


# List of global items for keyAggMapper (can also set flagLargeDelta to equal returnDelta)
# keyAggGlobal <- list("plotData"=TRUE, 
#                      "sortBy"=c("name", "pctDelta", "absDelta"),
#                      "dropNA"=TRUE,
#                      "printAll"=TRUE
#                      )

# Create the elements that are unique to each item
# keyAggMapper <- list("cdcDaily"=list("l1"=list("grpVar"="date",
#                                                "numVars"=c("new_cases", "new_deaths",
#                                                            "tot_cases", "tot_deaths"
#                                                            ),
#                                                "sameUniverse"=NA,
#                                                "isLine"=TRUE,
#                                                "returnDelta"=TRUE,
#                                                "pctTol"=0.05,
#                                                "absTol"=5
#                                                ),
#                                      "l2"=list("grpVar"="state",
#                                                "numVars"=c("new_cases", "new_deaths"),
#                                                "sameUniverse"=NA,
#                                                "isLine"=FALSE,
#                                                "returnDelta"=FALSE
#                                                ),
#                                      "l3"=list("grpVar"="state",
#                                                "numVars"=c("new_cases", "new_deaths",
#                                                            "tot_cases", "tot_deaths"
#                                                            ),
#                                                "sameUniverse"="date",
#                                                "isLine"=FALSE,
#                                                "returnDelta"=TRUE,
#                                                "pctTol"=0.001,
#                                                "absTol"=0
#                                                )
#                                      ),
#                      "cdcHosp"=list("l1"=list("grpVar"="date",
#                                               "numVars"=c("inp", "hosp_adult", "hosp_ped"),
#                                               "sameUniverse"=NA,
#                                               "isLine"=TRUE,
#                                               "returnDelta"=TRUE,
#                                               "pctTol"=0.05,
#                                               "absTol"=5
#                                               ),
#                                     "l2"=list("grpVar"="state",
#                                               "numVars"=c("inp", "hosp_adult", "hosp_ped"),
#                                               "sameUniverse"=NA,
#                                               "isLine"=FALSE,
#                                               "returnDelta"=FALSE
#                                               ),
#                                     "l3"=list("grpVar"="state",
#                                               "numVars"=c("inp", "hosp_adult", "hosp_ped"),
#                                               "sameUniverse"="date",
#                                               "isLine"=FALSE,
#                                               "returnDelta"=TRUE,
#                                               "pctTol"=0.001,
#                                               "absTol"=0
#                                               )
#                                     )
#                      )

# Add global variables to keyAggMapper
# NOTE(review): as drafted, keyAggFunc returns a list of modified copies of x (one per
# name in y) rather than a single merged list, and the extra lapply(b, ...) iterates one
# level too deep. If this is revived, base::modifyList() does the merge correctly, e.g.:
# keyAggMapper <- lapply(keyAggMapper, FUN=function(a) lapply(a, FUN=function(b) modifyList(b, keyAggGlobal)))
# Original draft kept for reference:
# keyAggFunc <- function(x, y=keyAggGlobal) lapply(names(y), FUN=function(z) {x[[z]] <- y[[z]]; x})
# keyAggMapper <- lapply(keyAggMapper, FUN=function(a) lapply(a, FUN=function(b) {lapply(b, FUN=keyAggFunc)}))



# Function to allow for printing either to stdout or to an external log file
printLog <- function(x, txt="", writeLog=NULL, appendLog=TRUE) {

    # FUNCTION ARGUMENTS
    # x: an object to be printed
    # txt: a descriptor to include for the data in the log file or otherwise
    # writeLog: the external file location for printing (NULL means use the main log stdout)
    # appendLog: for an external log, should the file be appended rather than overwritten?

    toExternal <- !is.null(writeLog)
    if (toExternal) {
        # Leave a pointer on stdout, then capture the header and object into the log file
        cat("\nDetailed output available in log:", writeLog)
        capture.output(cat("\n\n", txt, "\n", sep=""), print(x),
                       file=writeLog, append=appendLog)
    } else {
        # No external log: print the header and object directly
        cat("\n", txt, "\n", sep="")
        print(x)
    }
}


# Function to read and check a raw data file
# Orchestrates the raw-data QC pipeline:
#   0a. infer unset parameters from urlType via the global mapping files
#   0b. open/append the external log (if any)
#   1.  optionally download the file
#   2.  read, rename, mutate, and check uniqueness
#   3.  plot basic control totals
#   4.  (only if dfRef is supplied) check similarity vs. the reference file
#   5.  plot the similarity checks
#   6.  plot and report on differences in aggregates
# Returns the processed raw data frame.
readQCRawFile <- function(fileName, 
                          writeLog=NULL,
                          ovrwriteLog=TRUE,
                          dfRef=NULL,
                          urlType=NULL,
                          url=NULL, 
                          getData=TRUE,
                          ovrWriteDownload=FALSE, 
                          vecRename=NULL, 
                          selfList=NULL,
                          fullList=NULL,
                          uniqueBy=NULL, 
                          step3Group=NULL,
                          step3Vals=NULL, 
                          step4KeyVars=NULL, 
                          step5PlotItems=NULL,
                          step6AggregateList=NULL,
                          inferVars=list("url"=urlMapper, 
                                         "vecRename"=renMapper, 
                                         "selfList"=selfListMapper, 
                                         "fullList"=fullListMapper, 
                                         "uniqueBy"=uqMapper, 
                                         "step3Group"=checkControlGroupMapper,
                                         "step3Vals"=checkControlVarsMapper, 
                                         "step4KeyVars"=checkSimilarityMapper, 
                                         "step5PlotItems"=plotSimilarityMapper,
                                         "step6AggregateList"=keyAggMapper
                                         )
                          ) {
    
    # FUNCTION ARGUMENTS
    # fileName: the location where downloaded data either is, or will be, stored
    # writeLog: the external file location for printing (NULL means use the main log stdout)
    # ovrwriteLog: boolean, if using an external log, should it be started from scratch (overwritten)?
    # dfRef: a reference data frame for comparison (either NULL or NA means do not run comparisons)
    # urlType: character vector that can be mapped using urlMapper and keyVarMapper
    # url: direct URL passed as character string
    #      NOTE that if both url and urlType are NULL, no file will be downloaded
    # getData: boolean, should an attempt be made to get new data using urlType or url?
    # ovrWriteDownload: boolean, if fileName already exists, should it be overwritten?
    # vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
    #            NULL means infer from urlType, if not available there use c()
    # selfList: list for functions to apply to self, list('variable'=fn) will apply variable=fn(variable)
    #           processed in order, so more than one function can be applied to self
    #           NULL means infer from urlType, if not available in mapping file use list()
    # fullList: list for general functions to be applied, list('new variable'=expression(code))
    #           will create 'new variable' as eval(expression(code))
    #           for now, requires passing an expression
    #           NULL means infer from urlType, use list() if not in mapping file
    # uniqueBy: combination of variables for checking uniqueness
    #           NULL means infer from data, keep as NULL (meaning use-all) if cannot be inferred
    # step3Group: variable to be used as the x-axis (grouping) for step 3 plots
    #             NULL means infer from data
    # step3Vals: values to be plotted on the y-axis for step 3 plots
    #            NULL means infer from data
    # step4KeyVars: list of parameters to be passed as keyVars= in step 4
    #               NULL means infer from urlType
    # step5PlotItems: items to be plotted in step 5
    #                 NULL means infer from urlType
    # step6AggregateList: drives the elements to be passed to compareAggregate() and flagLargeDelta()
    #                     NULL means infer from urlType
    # inferVars: vector of c('variable'='mapper') for inferring parameter values when passed as NULL
    
    # Step 0a: Use urlType to infer key variables if passed as NULL
    # FIX: guard against NULL urlType, which is documented as allowed; previously
    # `if (urlType %in% names(mapper))` errored with a zero-length condition
    for (vrbl in names(inferVars)) {
        mapper <- inferVars[[vrbl]]
        if (is.null(get(vrbl))) {
            if (!is.null(urlType) && urlType %in% names(mapper)) assign(vrbl, mapper[[urlType]])
            else if ("default" %in% names(mapper)) assign(vrbl, mapper[["default"]])
        }
    }

    # Step 0b: If an external log file is passed, add a note and take care of appending
    # (append=FALSE only when the caller asked to overwrite the log)
    if (!is.null(writeLog)) {
        txt <- paste0("\n\n*** Writing log at: ", Sys.time() %>% lubridate::with_tz(tzone="UTC"), "Z ***\n\n")
        capture.output(cat(txt), file=writeLog, append=isFALSE(ovrwriteLog))
    }
    
    # Step 1: Download a new file (if requested)
    # FIX: use scalar && rather than elementwise & for the if() condition
    if (!is.null(url) && isTRUE(getData)) fileDownload(fileName=fileName, url=url, ovrWrite=ovrWriteDownload)
    else cat("\nNo file has been downloaded, will use existing file:", fileName, "\n")
    
    # Step 2: Read file, rename and mutate variables, confirm uniqueness by expected levels
    dfRaw <- fileRead(fileName) %>% 
        colRenamer(vecRename) %>% 
        colMutater(selfList=selfList, fullList=fullList) %>%
        checkUniqueRows(uniqueBy=uniqueBy)

    # Step 3: Plot basic control totals (e.g. new cases and new deaths by date)
    dfRaw %>%
        checkControl(groupBy=step3Group, useVars=step3Vals, printControls=FALSE, na.rm=TRUE) %>%
        helperLinePlot(x=step3Group, y="newValue", facetVar="name", facetScales="free_y", groupColor="name")

    # If there is no file for comparison (NULL or a single NA), return the data now
    # FIX: short-circuit || and an is.atomic() guard; the old `is.na(dfRef)` branch
    # could produce a non-scalar if() condition for a one-column data frame
    if (is.null(dfRef) || (is.atomic(dfRef) && length(dfRef) == 1 && is.na(dfRef))) return(dfRaw)
    
    # Step 4: Check similarity of existing and reference file
    # ovrwriteLog=FALSE since everything should be an append after the opening text line in step 0
    diffRaw <- checkSimilarity(df=dfRaw, ref=dfRef, keyVars=step4KeyVars, writeLog=writeLog, ovrwriteLog=FALSE)

    # Step 5: Plot the similarity checks
    plotSimilarity(diffRaw, plotItems=step5PlotItems)

    # Step 6: Plot and report on differences in aggregates
    # helperAggMap: run one compareAggregate() spec, optionally flagging large deltas
    helperAggMap <- function(x) {
        h1 <- compareAggregate(df=dfRaw, ref=dfRef, grpVar=x$grpVar, numVars=x$numVars, 
                               sameUniverse=x$sameUniverse, plotData=x$plotData, isLine=x$isLine, 
                               returnDelta=x$returnDelta)
        if (isTRUE(x$flagLargeDelta)) {
            h2 <- flagLargeDelta(h1, pctTol=x$pctTol, absTol=x$absTol, sortBy=x$sortBy, 
                                 dropNA=x$dropNA, printAll=x$printAll
                                 )
            if (is.null(writeLog)) print(h2)
            else {
                # Summarize on stdout; route the full table to the external log
                cat(nrow(h2), " records", sep="")
                txt <- paste0("\n\n***Differences of at least ", 
                              x$absTol, 
                              " and at least ", 
                              round(100*x$pctTol, 3), "%\n\n"
                              )
                printLog(h2, txt=txt, writeLog=writeLog)
            }
        }
    }
    lapply(step6AggregateList, FUN=helperAggMap)

    cat("\n\n")
    
    # Return the raw data file
    dfRaw

}

The function is tested on previously downloaded data:

# Note that output data will include NA and has not yet mapped NYC and NY together
dfCaseDeath <- readQCRawFile(
    fileName="./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv",
    writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log",
    urlType="cdcDaily",
    getData=FALSE,
    dfRef=dfRaw_dc_210414 %>%
        colRenamer(c('new_case'='new_cases',
                     'tot_death'='tot_deaths',
                     'new_death'='new_deaths'))
)
## 
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv
## 
## -- Column specification --------------------------------------------------------
## cols(
##   submission_date = col_character(),
##   state = col_character(),
##   tot_cases = col_double(),
##   conf_cases = col_double(),
##   prob_cases = col_double(),
##   new_case = col_double(),
##   pnew_case = col_double(),
##   tot_death = col_double(),
##   conf_death = col_double(),
##   prob_death = col_double(),
##   new_death = col_double(),
##   pnew_death = col_double(),
##   created_at = col_character(),
##   consent_cases = col_character(),
##   consent_deaths = col_character()
## )
## 
## *** File has been checked for uniqueness by: state date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: naconf
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 18
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
## 
## Checking for similarity of: state
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 97 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
dfCaseDeath
## # A tibble: 27,900 x 15
##    date       state tot_cases conf_cases prob_cases new_cases pnew_case
##    <date>     <chr>     <dbl>      <dbl>      <dbl>     <dbl>     <dbl>
##  1 2021-04-01 RI       138406         NA         NA       428         0
##  2 2020-10-15 DC        16166         NA         NA        34         0
##  3 2021-03-16 MN       498926         NA         NA       708       152
##  4 2021-04-16 CT       329062     302775      26287      1062       208
##  5 2020-02-14 DC            0         NA         NA         0        NA
##  6 2020-03-12 NJ           29         NA         NA         6        NA
##  7 2020-06-25 NE        18346         NA         NA       125         0
##  8 2020-02-24 CA           10         NA         NA         0        NA
##  9 2020-11-14 VA       201961     183455      18506      1161       191
## 10 2020-11-17 DE        29755      28473       1282       203        11
## # ... with 27,890 more rows, and 8 more variables: tot_deaths <dbl>,
## #   conf_death <dbl>, prob_death <dbl>, new_deaths <dbl>, pnew_death <dbl>,
## #   created_at <chr>, consent_cases <chr>, consent_deaths <chr>

The function appears to work as intended for previously downloaded data. Next steps are to allow for an external log file and to further parameterize for easier reading and usage.

The function is then run on previously downloaded hospital data, without a reference file:

# Note that output data will include NA and has not yet mapped NYC and NY together
# No reference file here, so only steps 0-3 of the QC pipeline run
dfHosp_old <- readQCRawFile(fileName="./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv",
                            urlType="cdcHosp", getData=FALSE, dfRef=NULL)
## 
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   date = col_date(format = ""),
##   geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
## 
## *** File has been checked for uniqueness by: state date

dfHosp_old
## # A tibble: 22,435 x 61
##    state date       critical_staffi~ critical_staffi~ critical_staffi~
##    <chr> <date>                <dbl>            <dbl>            <dbl>
##  1 PR    2020-08-04                8               37               22
##  2 VI    2020-08-04                1                1                0
##  3 PR    2020-08-03                7               39               21
##  4 VI    2020-08-03                1                1                0
##  5 PR    2020-08-02                7               35               25
##  6 VI    2020-08-02                1                1                0
##  7 PR    2020-08-01                7               38               22
##  8 VI    2020-08-01                1                1                0
##  9 PR    2020-07-31                9               36               22
## 10 VI    2020-07-31                1                1                0
## # ... with 22,425 more rows, and 56 more variables:
## #   critical_staffing_shortage_anticipated_within_week_yes <dbl>,
## #   critical_staffing_shortage_anticipated_within_week_no <dbl>,
## #   critical_staffing_shortage_anticipated_within_week_not_reported <dbl>,
## #   hospital_onset_covid <dbl>, hospital_onset_covid_coverage <dbl>,
## #   inpatient_beds <dbl>, inpatient_beds_coverage <dbl>,
## #   inpatient_beds_used <dbl>, inpatient_beds_used_coverage <dbl>, inp <dbl>,
## #   inpatient_beds_used_covid_coverage <dbl>,
## #   previous_day_admission_adult_covid_confirmed <dbl>,
## #   previous_day_admission_adult_covid_confirmed_coverage <dbl>,
## #   previous_day_admission_adult_covid_suspected <dbl>,
## #   previous_day_admission_adult_covid_suspected_coverage <dbl>,
## #   previous_day_admission_pediatric_covid_confirmed <dbl>,
## #   previous_day_admission_pediatric_covid_confirmed_coverage <dbl>,
## #   previous_day_admission_pediatric_covid_suspected <dbl>,
## #   previous_day_admission_pediatric_covid_suspected_coverage <dbl>,
## #   staffed_adult_icu_bed_occupancy <dbl>,
## #   staffed_adult_icu_bed_occupancy_coverage <dbl>,
## #   staffed_icu_adult_patients_confirmed_and_suspected_covid <dbl>,
## #   staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage <dbl>,
## #   staffed_icu_adult_patients_confirmed_covid <dbl>,
## #   staffed_icu_adult_patients_confirmed_covid_coverage <dbl>,
## #   hosp_adult <dbl>,
## #   total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl>,
## #   total_adult_patients_hospitalized_confirmed_covid <dbl>,
## #   total_adult_patients_hospitalized_confirmed_covid_coverage <dbl>,
## #   hosp_ped <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_covid <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_covid_coverage <dbl>,
## #   total_staffed_adult_icu_beds <dbl>,
## #   total_staffed_adult_icu_beds_coverage <dbl>,
## #   inpatient_beds_utilization <dbl>,
## #   inpatient_beds_utilization_coverage <dbl>,
## #   inpatient_beds_utilization_numerator <dbl>,
## #   inpatient_beds_utilization_denominator <dbl>,
## #   percent_of_inpatients_with_covid <dbl>,
## #   percent_of_inpatients_with_covid_coverage <dbl>,
## #   percent_of_inpatients_with_covid_numerator <dbl>,
## #   percent_of_inpatients_with_covid_denominator <dbl>,
## #   inpatient_bed_covid_utilization <dbl>,
## #   inpatient_bed_covid_utilization_coverage <dbl>,
## #   inpatient_bed_covid_utilization_numerator <dbl>,
## #   inpatient_bed_covid_utilization_denominator <dbl>,
## #   adult_icu_bed_covid_utilization <dbl>,
## #   adult_icu_bed_covid_utilization_coverage <dbl>,
## #   adult_icu_bed_covid_utilization_numerator <dbl>,
## #   adult_icu_bed_covid_utilization_denominator <dbl>,
## #   adult_icu_bed_utilization <dbl>, adult_icu_bed_utilization_coverage <dbl>,
## #   adult_icu_bed_utilization_numerator <dbl>,
## #   adult_icu_bed_utilization_denominator <dbl>, geocoded_state <chr>

The function appears to work as intended for reading a hospital data file. Next steps are to use the function to download new hospital data, then to run comparisons to existing file.

The process is run using itself as a reference file:

# Note that output data will include NA and has not yet mapped NYC and NY together
dfHosp <- readQCRawFile(fileName="./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv", 
                        writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log",
                        ovrwriteLog=FALSE,
                        urlType="cdcHosp",
                        getData=FALSE,
                        dfRef=dfHosp_old
                        )
## 
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_h_downloaded_210429.csv
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   date = col_date(format = ""),
##   geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
## 
## *** File has been checked for uniqueness by: state date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 0
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
## 
## Checking for similarity of: state
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
dfHosp
## # A tibble: 22,435 x 61
##    state date       critical_staffi~ critical_staffi~ critical_staffi~
##    <chr> <date>                <dbl>            <dbl>            <dbl>
##  1 PR    2020-08-04                8               37               22
##  2 VI    2020-08-04                1                1                0
##  3 PR    2020-08-03                7               39               21
##  4 VI    2020-08-03                1                1                0
##  5 PR    2020-08-02                7               35               25
##  6 VI    2020-08-02                1                1                0
##  7 PR    2020-08-01                7               38               22
##  8 VI    2020-08-01                1                1                0
##  9 PR    2020-07-31                9               36               22
## 10 VI    2020-07-31                1                1                0
## # ... with 22,425 more rows, and 56 more variables:
## #   critical_staffing_shortage_anticipated_within_week_yes <dbl>,
## #   critical_staffing_shortage_anticipated_within_week_no <dbl>,
## #   critical_staffing_shortage_anticipated_within_week_not_reported <dbl>,
## #   hospital_onset_covid <dbl>, hospital_onset_covid_coverage <dbl>,
## #   inpatient_beds <dbl>, inpatient_beds_coverage <dbl>,
## #   inpatient_beds_used <dbl>, inpatient_beds_used_coverage <dbl>, inp <dbl>,
## #   inpatient_beds_used_covid_coverage <dbl>,
## #   previous_day_admission_adult_covid_confirmed <dbl>,
## #   previous_day_admission_adult_covid_confirmed_coverage <dbl>,
## #   previous_day_admission_adult_covid_suspected <dbl>,
## #   previous_day_admission_adult_covid_suspected_coverage <dbl>,
## #   previous_day_admission_pediatric_covid_confirmed <dbl>,
## #   previous_day_admission_pediatric_covid_confirmed_coverage <dbl>,
## #   previous_day_admission_pediatric_covid_suspected <dbl>,
## #   previous_day_admission_pediatric_covid_suspected_coverage <dbl>,
## #   staffed_adult_icu_bed_occupancy <dbl>,
## #   staffed_adult_icu_bed_occupancy_coverage <dbl>,
## #   staffed_icu_adult_patients_confirmed_and_suspected_covid <dbl>,
## #   staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage <dbl>,
## #   staffed_icu_adult_patients_confirmed_covid <dbl>,
## #   staffed_icu_adult_patients_confirmed_covid_coverage <dbl>,
## #   hosp_adult <dbl>,
## #   total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl>,
## #   total_adult_patients_hospitalized_confirmed_covid <dbl>,
## #   total_adult_patients_hospitalized_confirmed_covid_coverage <dbl>,
## #   hosp_ped <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_covid <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_covid_coverage <dbl>,
## #   total_staffed_adult_icu_beds <dbl>,
## #   total_staffed_adult_icu_beds_coverage <dbl>,
## #   inpatient_beds_utilization <dbl>,
## #   inpatient_beds_utilization_coverage <dbl>,
## #   inpatient_beds_utilization_numerator <dbl>,
## #   inpatient_beds_utilization_denominator <dbl>,
## #   percent_of_inpatients_with_covid <dbl>,
## #   percent_of_inpatients_with_covid_coverage <dbl>,
## #   percent_of_inpatients_with_covid_numerator <dbl>,
## #   percent_of_inpatients_with_covid_denominator <dbl>,
## #   inpatient_bed_covid_utilization <dbl>,
## #   inpatient_bed_covid_utilization_coverage <dbl>,
## #   inpatient_bed_covid_utilization_numerator <dbl>,
## #   inpatient_bed_covid_utilization_denominator <dbl>,
## #   adult_icu_bed_covid_utilization <dbl>,
## #   adult_icu_bed_covid_utilization_coverage <dbl>,
## #   adult_icu_bed_covid_utilization_numerator <dbl>,
## #   adult_icu_bed_covid_utilization_denominator <dbl>,
## #   adult_icu_bed_utilization <dbl>, adult_icu_bed_utilization_coverage <dbl>,
## #   adult_icu_bed_utilization_numerator <dbl>,
## #   adult_icu_bed_utilization_denominator <dbl>, geocoded_state <chr>

The function then downloads and processes new data, cached to avoid multiple hits against the server:

# Note that output data will include NA and has not yet mapped NYC and NY together
dfHosp <- readQCRawFile(fileName="./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv", 
                        writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log",
                        ovrwriteLog=FALSE,
                        urlType="cdcHosp",
                        getData=TRUE,
                        dfRef=dfHosp_old
                        )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   date = col_date(format = ""),
##   geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
## 
## *** File has been checked for uniqueness by: state date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: previous_day_admission_adult_covid_confirmed_18-19 previous_day_admission_adult_covid_confirmed_18-19_coverage previous_day_admission_adult_covid_confirmed_20-29 previous_day_admission_adult_covid_confirmed_20-29_coverage previous_day_admission_adult_covid_confirmed_30-39 previous_day_admission_adult_covid_confirmed_30-39_coverage previous_day_admission_adult_covid_confirmed_40-49 previous_day_admission_adult_covid_confirmed_40-49_coverage previous_day_admission_adult_covid_confirmed_50-59 previous_day_admission_adult_covid_confirmed_50-59_coverage previous_day_admission_adult_covid_confirmed_60-69 previous_day_admission_adult_covid_confirmed_60-69_coverage previous_day_admission_adult_covid_confirmed_70-79 previous_day_admission_adult_covid_confirmed_70-79_coverage previous_day_admission_adult_covid_confirmed_80+ previous_day_admission_adult_covid_confirmed_80+_coverage previous_day_admission_adult_covid_confirmed_unknown previous_day_admission_adult_covid_confirmed_unknown_coverage previous_day_admission_adult_covid_suspected_18-19 previous_day_admission_adult_covid_suspected_18-19_coverage previous_day_admission_adult_covid_suspected_20-29 previous_day_admission_adult_covid_suspected_20-29_coverage previous_day_admission_adult_covid_suspected_30-39 previous_day_admission_adult_covid_suspected_30-39_coverage previous_day_admission_adult_covid_suspected_40-49 previous_day_admission_adult_covid_suspected_40-49_coverage previous_day_admission_adult_covid_suspected_50-59 previous_day_admission_adult_covid_suspected_50-59_coverage previous_day_admission_adult_covid_suspected_60-69 previous_day_admission_adult_covid_suspected_60-69_coverage previous_day_admission_adult_covid_suspected_70-79 previous_day_admission_adult_covid_suspected_70-79_coverage previous_day_admission_adult_covid_suspected_80+ previous_day_admission_adult_covid_suspected_80+_coverage previous_day_admission_adult_covid_suspected_unknown 
previous_day_admission_adult_covid_suspected_unknown_coverage
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 15
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
## 
## Checking for similarity of: state
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 6 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 63 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
glimpse(dfHosp)
## Rows: 23,230
## Columns: 97
## $ state                                                                        <chr> ...
## $ date                                                                         <date> ...
## $ critical_staffing_shortage_today_yes                                         <dbl> ...
## $ critical_staffing_shortage_today_no                                          <dbl> ...
## $ critical_staffing_shortage_today_not_reported                                <dbl> ...
## $ critical_staffing_shortage_anticipated_within_week_yes                       <dbl> ...
## $ critical_staffing_shortage_anticipated_within_week_no                        <dbl> ...
## $ critical_staffing_shortage_anticipated_within_week_not_reported              <dbl> ...
## $ hospital_onset_covid                                                         <dbl> ...
## $ hospital_onset_covid_coverage                                                <dbl> ...
## $ inpatient_beds                                                               <dbl> ...
## $ inpatient_beds_coverage                                                      <dbl> ...
## $ inpatient_beds_used                                                          <dbl> ...
## $ inpatient_beds_used_coverage                                                 <dbl> ...
## $ inp                                                                          <dbl> ...
## $ inpatient_beds_used_covid_coverage                                           <dbl> ...
## $ previous_day_admission_adult_covid_confirmed                                 <dbl> ...
## $ previous_day_admission_adult_covid_confirmed_coverage                        <dbl> ...
## $ previous_day_admission_adult_covid_suspected                                 <dbl> ...
## $ previous_day_admission_adult_covid_suspected_coverage                        <dbl> ...
## $ previous_day_admission_pediatric_covid_confirmed                             <dbl> ...
## $ previous_day_admission_pediatric_covid_confirmed_coverage                    <dbl> ...
## $ previous_day_admission_pediatric_covid_suspected                             <dbl> ...
## $ previous_day_admission_pediatric_covid_suspected_coverage                    <dbl> ...
## $ staffed_adult_icu_bed_occupancy                                              <dbl> ...
## $ staffed_adult_icu_bed_occupancy_coverage                                     <dbl> ...
## $ staffed_icu_adult_patients_confirmed_and_suspected_covid                     <dbl> ...
## $ staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage            <dbl> ...
## $ staffed_icu_adult_patients_confirmed_covid                                   <dbl> ...
## $ staffed_icu_adult_patients_confirmed_covid_coverage                          <dbl> ...
## $ hosp_adult                                                                   <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage     <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_covid                            <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_covid_coverage                   <dbl> ...
## $ hosp_ped                                                                     <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_covid                        <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_covid_coverage               <dbl> ...
## $ total_staffed_adult_icu_beds                                                 <dbl> ...
## $ total_staffed_adult_icu_beds_coverage                                        <dbl> ...
## $ inpatient_beds_utilization                                                   <dbl> ...
## $ inpatient_beds_utilization_coverage                                          <dbl> ...
## $ inpatient_beds_utilization_numerator                                         <dbl> ...
## $ inpatient_beds_utilization_denominator                                       <dbl> ...
## $ percent_of_inpatients_with_covid                                             <dbl> ...
## $ percent_of_inpatients_with_covid_coverage                                    <dbl> ...
## $ percent_of_inpatients_with_covid_numerator                                   <dbl> ...
## $ percent_of_inpatients_with_covid_denominator                                 <dbl> ...
## $ inpatient_bed_covid_utilization                                              <dbl> ...
## $ inpatient_bed_covid_utilization_coverage                                     <dbl> ...
## $ inpatient_bed_covid_utilization_numerator                                    <dbl> ...
## $ inpatient_bed_covid_utilization_denominator                                  <dbl> ...
## $ adult_icu_bed_covid_utilization                                              <dbl> ...
## $ adult_icu_bed_covid_utilization_coverage                                     <dbl> ...
## $ adult_icu_bed_covid_utilization_numerator                                    <dbl> ...
## $ adult_icu_bed_covid_utilization_denominator                                  <dbl> ...
## $ adult_icu_bed_utilization                                                    <dbl> ...
## $ adult_icu_bed_utilization_coverage                                           <dbl> ...
## $ adult_icu_bed_utilization_numerator                                          <dbl> ...
## $ adult_icu_bed_utilization_denominator                                        <dbl> ...
## $ geocoded_state                                                               <chr> ...
## $ `previous_day_admission_adult_covid_confirmed_18-19`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_18-19_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_20-29`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_20-29_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_30-39`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_30-39_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_40-49`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_40-49_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_50-59`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_50-59_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_60-69`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_60-69_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_70-79`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_70-79_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_80+`                           <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_80+_coverage`                  <dbl> ...
## $ previous_day_admission_adult_covid_confirmed_unknown                         <dbl> ...
## $ previous_day_admission_adult_covid_confirmed_unknown_coverage                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_18-19`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_18-19_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_20-29`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_20-29_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_30-39`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_30-39_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_40-49`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_40-49_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_50-59`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_50-59_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_60-69`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_60-69_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_70-79`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_70-79_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_80+`                           <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_80+_coverage`                  <dbl> ...
## $ previous_day_admission_adult_covid_suspected_unknown                         <dbl> ...
## $ previous_day_admission_adult_covid_suspected_unknown_coverage                <dbl> ...

There are a significant number of new columns added to the hospital download data, primarily related to age breakdowns for admissions in the previous day. The download is otherwise as expected, and the comparison process appears to be running well.

The keyAggMapper list has been updated so that it is a list with one entry per urlType, allowing for use like all other elements in readQCRawFile().

The function printLog has been added to allow for printing to an external log file.

Next, a function is written to process a raw data file, including:

  • Selecting a subset of fields (rename if needed)
  • Combining multiple records (e.g., NYC and NYS summing to NY)
  • Selecting a subset of records (e.g., limit to 50 states and DC)
# Function to process a raw file
processRawFile <- function(df, 
                           vecRename=c(), 
                           vecSelect=NULL,
                           lstCombo=list(), 
                           lstFilter=list()
                           ) {
    
    # Process a raw data frame: rename/select columns, combine multiple
    # records into one (e.g., NYC and NYS summing to NY), filter records,
    # and print a before/after QC summary so dropped volume is visible.
    #
    # FUNCTION ARGUMENTS:
    # df: the raw data frame or tibble
    # vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
    # vecSelect: vector of columns to select (run after vecRename), NULL means select all columns
    # lstCombo: a nested list of combinations to be applied
    #           each element of the list should include comboVar, uqVars, vecCombo, and fn
    # lstFilter: a list for filtering records, of form list("field"=c("allowed values"))
    #
    # RETURNS: the processed data frame (after renaming, selection,
    #          combination, and filtering); QC summary is printed as a side effect
    
    # STEP 1: Rename and select variables (selection occurs AFTER renaming)
    dfProcess <- df %>%
        colRenamer(vecRename=vecRename) %>%
        colSelector(vecSelect=vecSelect)
    
    # STEP 2: Combine multiple records to a single record
    for (ctr in seq_along(lstCombo)) {
        dfProcess <- dfProcess %>%
            combineRows(comboVar=lstCombo[[ctr]]$comboVar, 
                        uqVars=lstCombo[[ctr]]$uqVars, 
                        vecCombo=lstCombo[[ctr]]$vecCombo, 
                        fn=lstCombo[[ctr]]$fn
                        )
    }
    
    # STEP 3: Filter records
    # Capture pre-filter column sums first so they can be compared post-filter.
    # Use a lambda rather than passing na.rm through across() dots:
    # passing ... through across() is deprecated as of dplyr 1.1
    qcOrig <- dfProcess %>% 
        summarize(across(where(is.numeric), ~ sum(.x, na.rm=TRUE)), n=n()) %>% 
        mutate(isType="before")
    dfProcess <- dfProcess %>% 
        rowFilter(lstFilter=lstFilter)
    
    # STEP 4: Report on differences
    # pctchg row is (max-min)/max per column; NOTE(review): this yields NaN
    # when a column sums to 0 -- confirm whether that case needs handling
    cat("\nColumn sums before and after applying filtering rules:\n")
    dfProcess %>% 
        summarize(across(where(is.numeric), ~ sum(.x, na.rm=TRUE)), n=n()) %>% 
        mutate(isType="after") %>%
        bind_rows(qcOrig) %>%
        arrange(desc(isType)) %>%
        bind_rows(mutate(summarize(., across(where(is.numeric), function(x) (max(x)-min(x))/max(x))), 
                         isType="pctchg"
                         )
                  ) %>%
        select(isType, everything()) %>%
        print()
    cat("\n")
    
    # Return the processed data file
    dfProcess
    
}

The function is then tested on the cases and deaths data as well as the hospital data:

dfProcess_dc_210502 <- dfCaseDeath %>%
    processRawFile(vecSelect=c("date", "state", "tot_cases", "tot_deaths", "new_cases", "new_deaths"), 
                   lstCombo=list("nyc"=list("comboVar"="state", 
                                            "uqVars"="date", 
                                            "vecCombo"=c("NY"="NY", "NYC"="NY"),
                                            "fn"=specNASum
                                            )
                                 ), 
                   lstFilter=list("state"=c(state.abb, "DC"))
                   )
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
##   isType tot_cases tot_deaths new_cases   new_deaths         n
##   <chr>      <dbl>      <dbl>     <dbl>        <dbl>     <dbl>
## 1 before   5.08e+9    1.07e+8   3.21e+7 558830       27435    
## 2 after    5.06e+9    1.06e+8   3.19e+7 556355       23715    
## 3 pctchg   4.40e-3    3.81e-3   4.47e-3      0.00443     0.136
dfProcess_dc_210502
## # A tibble: 23,715 x 6
##    date       state tot_cases tot_deaths new_cases new_deaths
##    <date>     <chr>     <dbl>      <dbl>     <dbl>      <dbl>
##  1 2021-04-01 RI       138406       2627       428          2
##  2 2020-10-15 DC        16166        638        34          0
##  3 2021-03-16 MN       498926       6817       708          2
##  4 2021-04-16 CT       329062       7995      1062          5
##  5 2020-02-14 DC            0          0         0          0
##  6 2020-03-12 NJ           29          1         6          0
##  7 2020-06-25 NE        18346        260       125          3
##  8 2020-02-24 CA           10          0         0          0
##  9 2020-11-14 VA       201961       3800      1161          1
## 10 2020-11-17 DE        29755        742       203          3
## # ... with 23,705 more rows
dfProcess_h <- dfHosp %>%
    processRawFile(vecSelect=c("date", "state", "inp", "hosp_adult", "hosp_ped"), 
                   lstFilter=list("state"=c(state.abb, "DC"))
                   )
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
##   isType     inp hosp_adult    hosp_ped          n
##   <chr>    <dbl>      <dbl>       <dbl>      <dbl>
## 1 before 2.57e+7    1.99e+7 436353      23230     
## 2 after  2.56e+7    1.98e+7 426239      22395     
## 3 pctchg 5.60e-3    5.66e-3      0.0232     0.0359
dfProcess_h
## # A tibble: 22,395 x 5
##    date       state   inp hosp_adult hosp_ped
##    <date>     <chr> <dbl>      <dbl>    <dbl>
##  1 2020-07-27 ND       59         NA       NA
##  2 2020-07-26 ND       45         NA       NA
##  3 2020-07-25 ND       43         NA       NA
##  4 2020-07-24 NE      219        202        0
##  5 2020-07-21 ND       46         NA       NA
##  6 2020-07-09 HI       20         NA       NA
##  7 2020-06-24 ME       67         NA       NA
##  8 2020-06-12 MD     1179         NA       NA
##  9 2020-06-10 OR      149         NA       NA
## 10 2020-06-08 NH       94         NA       NA
## # ... with 22,385 more rows

The processing functions appear to be working as intended. Next steps are to join the data frames then to add population data and calculate per capita metrics.

A function is written to join files and convert metrics to per capita:

combineFiles <- function(lst, 
                         fn=dplyr::full_join, 
                         byVars=NULL,
                         ...
                         ) {

    # Collapse a list of data frames into one by applying fn pairwise.
    #
    # lst: A list containing one or more files to be joined
    # fn: A function for joining files
    # byVars: character string "by variables", which must be consistent across files
    #         NULL means infer from data as is standard in dplyr merges
    # ...: other arguments to be passed to fn by way of reduce
    
    # Bind the join keys (and any extra arguments) into a pairwise joiner,
    # then fold the list down to a single data frame
    joinPair <- function(dfAccum, dfNext) fn(dfAccum, dfNext, by=byVars, ...)
    purrr::reduce(lst, .f=joinPair)
    
}


createPerCapita <- function(lst, 
                            uqBy,
                            popData,
                            lstSortBy=uqBy,
                            fnJoin=dplyr::full_join, 
                            popJoinBy="state",
                            popVar="pop",
                            k=7,
                            mapper=perCapMapper,
                            ...
                            ) {
    
    # Join one or more processed files (if needed), sort them, and convert
    # metrics to per capita values via helperMakePerCapita().
    #
    # FUNCTION ARGUMENTS:
    # lst: A list containing one or more files to be joined OR a data frame that is already joined
    # uqBy: character string that the input file is unique by (will be the join keys if a list is passed)
    # popData: file containing population data that can be joined to the processed lst
    # lstSortBy: the sorting that should be used for creating rolling metrics
    # fnJoin: The function to be used for joining files
    # popJoinBy: character string for the variable(s) to be used in joining popData to lst
    #            NOTE(review): not used in this version; retained for interface
    #            stability -- confirm whether helperMakePerCapita() should receive it
    # popVar: character string for the variable in popData that represents population
    #         NOTE(review): not used in this version; retained for interface stability
    # k: time period for rolling aggregations
    # mapper: mapping file of c('current name'='per capita name') for mapping variables
    # ...: other arguments to be passed to combineFiles()
    
    # Step 1: If a list has been passed, use a joining process to create a data frame
    if (inherits(lst, "list")) lst <- combineFiles(lst, byVars=uqBy, fn=fnJoin, ...)
    
    # Step 2: Sort the data using lstSortBy
    df <- dplyr::arrange(lst, across(all_of(lstSortBy)))
    
    # Step 3: Check that all variables other than uqBy can be mapped using mapper
    # The mapper's NAMES are the current variable names (its values are the per
    # capita names, consistent with mapper[keyVars] below), so membership must be
    # tested against names(mapper). The original any(isFALSE(...)) construct could
    # never trigger: isFALSE() returns FALSE for any vector of length > 1.
    keyVars <- setdiff(names(df), uqBy)
    if (!all(keyVars %in% names(mapper))) {
        stop("\nVariable is missing in per capita mapper file: ", 
             paste(setdiff(keyVars, names(mapper)), collapse=", "), 
             "\n")
    }
    
    # Step 4: Run the per capita mapping process
    df <- helperMakePerCapita(df, 
                              mapVars=mapper[keyVars], 
                              popData=popData, 
                              k=k
                              )
    
    # Return the data frame
    df
    
}

The function is then tested using the processed data files:

dfPerCap_210502 <- createPerCapita(list(dfProcess_dc_210502, dfProcess_h), 
                                   uqBy=c("state", "date"), 
                                   popData=stateDataCDC
                                   )
dfPerCap_210502
## # A tibble: 24,285 x 23
##    date       state tot_cases tot_deaths new_cases new_deaths   inp hosp_adult
##    <date>     <chr>     <dbl>      <dbl>     <dbl>      <dbl> <dbl>      <dbl>
##  1 2020-01-01 AL           NA         NA        NA         NA    NA         NA
##  2 2020-01-01 HI           NA         NA        NA         NA     0         NA
##  3 2020-01-01 IN           NA         NA        NA         NA     0         NA
##  4 2020-01-01 LA           NA         NA        NA         NA    NA         NA
##  5 2020-01-01 MN           NA         NA        NA         NA     0         NA
##  6 2020-01-01 MT           NA         NA        NA         NA     0         NA
##  7 2020-01-01 NC           NA         NA        NA         NA     0         NA
##  8 2020-01-01 TX           NA         NA        NA         NA     0         NA
##  9 2020-01-02 AL           NA         NA        NA         NA    NA         NA
## 10 2020-01-02 HI           NA         NA        NA         NA     0         NA
## # ... with 24,275 more rows, and 15 more variables: hosp_ped <dbl>, tcpm <dbl>,
## #   tdpm <dbl>, cpm <dbl>, dpm <dbl>, hpm <dbl>, ahpm <dbl>, phpm <dbl>,
## #   tcpm7 <dbl>, tdpm7 <dbl>, cpm7 <dbl>, dpm7 <dbl>, hpm7 <dbl>, ahpm7 <dbl>,
## #   phpm7 <dbl>

Next steps are to update the helperMakePerCapita() function so that it does not need to assume that data are unique by state and sorted by date:

createPerCapita <- function(lst, 
                            uqBy,
                            popData,
                            lstSortBy=uqBy,
                            fnJoin=dplyr::full_join, 
                            popJoinBy="state",
                            popVar="pop",
                            k=7,
                            mapper=perCapMapper,
                            ...
                            ) {
    
    # FUNCTION ARGUMENTS:
    # lst: A list containing one or more files to be joined OR a data frame that is already joined
    # uqBy: character vector that the input file is unique by (will be the join keys if a list is passed)
    # popData: file containing population data that can be joined to the processed lst
    # lstSortBy: the sorting that should be used for creating rolling metrics
    # fnJoin: The function to be used for joining files
    # popJoinBy: character string for the variable(s) to be used in joining popData to lst
    # popVar: character string for the variable in popData that represents population
    # k: time period for rolling aggregations
    # mapper: mapping file of c('current name'='per capita name') for mapping variables
    # ...: other arguments to be passed to combineFiles()
    
    # Step 1: If a list has been passed, use a joining process to create a data frame
    if (inherits(lst, "list")) lst <- combineFiles(lst, byVars=uqBy, fn=fnJoin, ...)
    
    # Step 2: Sort the data using lstSortBy
    df <- dplyr::arrange(lst, across(all_of(lstSortBy)))
    
    # Step 3: Check that all variables other than uqBy can be mapped using mapper
    # BUG FIX: the old check any(isFALSE(keyVars %in% mapper)) could never trigger,
    # because isFALSE() is TRUE only for a single FALSE value (not for a logical
    # vector containing FALSE). Also, Step 4 indexes mapper by NAME (mapper[keyVars]),
    # so membership must be tested against names(mapper), not the mapper values.
    keyVars <- setdiff(names(df), uqBy)
    missingVars <- setdiff(keyVars, names(mapper))
    if (length(missingVars) > 0) {
        stop("\nVariable is missing in per capita mapper file: ", 
             paste(missingVars, collapse=", "), "\n")
    }
    
    # Step 4: Run the per capita mapping process
    df <- helperMakePerCapita(df, 
                              mapVars=mapper[keyVars], 
                              popData=popData, 
                              k=k, 
                              byVar=popJoinBy, 
                              sortVar=setdiff(lstSortBy, popJoinBy), 
                              popVar=popVar, 
                              mult=1000000
                              )
    
    # Return the data frame
    df
    
}


# Helper function to create per capita metrics
helperPerCapita <- function(df, 
                            origVar, 
                            newName,
                            byVar="state",
                            popVar="pop",
                            popData=stateData,
                            mult=1000000
                            ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame currently being processed
    # origVar: the variable (column name, as a string) to be converted to per capita
    # newName: the new per capita variable name
    # byVar: the variable that will be merged by
    # popVar: the name of the population variable in the popData file
    # popData: the file containing the population data
    # mult: the multiplier, so that the metric is "per mult people"
    
    # Join the population column on byVar, compute the per capita metric, then
    # drop the population column again. The .data pronoun replaces get() so
    # that column lookup is restricted to the data frame and cannot silently
    # pick up objects from the calling environment; select_at()/vars() are
    # superseded in current dplyr in favor of select(all_of()).
    df %>%
        inner_join(select(popData, all_of(c(byVar, popVar))), by=byVar) %>%
        mutate(!!newName := mult * .data[[origVar]] / .data[[popVar]]) %>%
        select(-all_of(popVar))
    
}


# Helper function to create rolling aggregates
helperRollingAgg <- function(df, 
                             origVar, 
                             newName,
                             func=zoo::rollmean,
                             k=7, 
                             fill=NA, 
                             ...
                             ) {
    
    # FUNCTION ARGUMENTS:
    # df: the data frame containing the data
    # origVar: the original data column name (as a string)
    # newName: the new variable column name
    # func: the function to be applied (zoo::rollmean will be by far the most common)
    # k: the periodicity (k=7 is rolling weekly data)
    # fill: how to fill leading/trailing data to maintain the same vector lengths
    # ...: any other arguments to be passed to func
    
    # Create the rolling variable; the .data pronoun replaces get() so that
    # the column lookup cannot fall through to the calling environment. When
    # df is grouped (as in helperMakePerCapita), mutate() applies func within
    # each group.
    df %>%
        mutate(!!newName := func(.data[[origVar]], k=k, fill=fill, ...))
    
}


# Function to add per capita and rolling to the base data frame
helperMakePerCapita <- function(df, 
                                mapVars=c("cases"="cpm", "deaths"="dpm"),
                                popData=stateData,
                                k=7, 
                                byVar="state", 
                                sortVar="date",
                                popVar="pop", 
                                mult=1000000
                                ) {
    
    # FUNCTION ARGUMENTS:
    # df: the initial data frame for conversion
    # mapVars: named vector of variables to be converted 'original name'='converted name'
    # popData: the file containing the population data
    # k: the rolling time period to use
    # byVar: grouping variable for df
    # sortVar: each element of byVar should be sorted by sortVar prior to rolling aggregations
    # popVar: column name in popData that represents population
    # mult: unit for 'per capita' variable (1,000,000 will make 'per million' metrics)
    
    # Add one per capita column per entry of mapVars by folding
    # helperPerCapita() over the original variable names
    df <- purrr::reduce(
        names(mapVars),
        function(acc, origVar) {
            helperPerCapita(acc, 
                            origVar=origVar, 
                            newName=mapVars[origVar], 
                            popData=popData, 
                            byVar=byVar, 
                            popVar=popVar, 
                            mult=mult
                            )
        }, 
        .init=df
    )
    
    # Group and arrange the data prior to creating rolling aggregates
    df <- df %>% 
        group_by(across(.cols=all_of(byVar))) %>% 
        arrange(across(.cols=all_of(sortVar)))
    
    # Fold helperRollingAgg() over the per capita names to add the k-period
    # rolling versions (e.g., cpm -> cpm7 when k=7)
    df <- purrr::reduce(
        unname(mapVars),
        function(acc, pcVar) helperRollingAgg(acc, origVar=pcVar, newName=paste0(pcVar, k), k=k),
        .init=df
    )
    
    # Return the updated data frame, ungrouped
    ungroup(df)
    
}

The function is then tested using the processed data files:

# Re-run the integration with the updated helpers; the result should match
# the earlier dfPerCap_210502 exactly
dfPerCap_210502_v2 <- createPerCapita(list(dfProcess_dc_210502, dfProcess_h), 
                                      uqBy=c("state", "date"), 
                                      popData=stateDataCDC
                                      )
dfPerCap_210502_v2
## # A tibble: 24,285 x 23
##    date       state tot_cases tot_deaths new_cases new_deaths   inp hosp_adult
##    <date>     <chr>     <dbl>      <dbl>     <dbl>      <dbl> <dbl>      <dbl>
##  1 2020-01-01 AL           NA         NA        NA         NA    NA         NA
##  2 2020-01-01 HI           NA         NA        NA         NA     0         NA
##  3 2020-01-01 IN           NA         NA        NA         NA     0         NA
##  4 2020-01-01 LA           NA         NA        NA         NA    NA         NA
##  5 2020-01-01 MN           NA         NA        NA         NA     0         NA
##  6 2020-01-01 MT           NA         NA        NA         NA     0         NA
##  7 2020-01-01 NC           NA         NA        NA         NA     0         NA
##  8 2020-01-01 TX           NA         NA        NA         NA     0         NA
##  9 2020-01-02 AL           NA         NA        NA         NA    NA         NA
## 10 2020-01-02 HI           NA         NA        NA         NA     0         NA
## # ... with 24,275 more rows, and 15 more variables: hosp_ped <dbl>, tcpm <dbl>,
## #   tdpm <dbl>, cpm <dbl>, dpm <dbl>, hpm <dbl>, ahpm <dbl>, phpm <dbl>,
## #   tcpm7 <dbl>, tdpm7 <dbl>, cpm7 <dbl>, dpm7 <dbl>, hpm7 <dbl>, ahpm7 <dbl>,
## #   phpm7 <dbl>
identical(dfPerCap_210502_v2, dfPerCap_210502)
## [1] TRUE

The function can now take arguments for the uniqueness variables, the sorting variables, and the population variable. Results are identical to the previous version. The integrated file is now available for use in the main routine. Next steps are to integrate the new functions into a main function that can download, read, process, integrate, and analyze CDC daily data.

The integrated function readRunCDCDaily() is updated:

# Helper function for glimpsing: print a label, then glimpse the object
glimpseFile <- function(x, txt) {
    # Emit the label first so the glimpse output is identifiable in a log;
    # glimpse() returns its input invisibly, and so does this wrapper
    cat(txt)
    invisible(glimpse(x))
}


# Helper function to glimpse either to the console or to a logfile
glimpseLog <- function(df, txt="", logFile=NULL, append=TRUE) {
    # A NULL logFile means glimpse to the console; otherwise capture the
    # glimpse output and write (or append) it to the named file
    if (is.null(logFile)) {
        glimpseFile(df, txt=txt)
    } else {
        capture.output(glimpseFile(df, txt=txt), file=logFile, append=append)
    }
}


# Helper function to get the clusters from a clustering file
getClusters <- function(clData, hier=FALSE, kCut=0, reAssign=list()) {        
    
    # FUNCTION ARGUMENTS:
    # clData: clustering output; an hclust-style tree when hier=TRUE, otherwise
    #         a list with an objCluster element (see branch logic below)
    # hier: TRUE means hierarchical clustering (tree is cut into kCut clusters);
    #       NA means clData$objCluster already holds the cluster vector;
    #       FALSE means clusters are at clData$objCluster$cluster (e.g., kmeans output)
    # kCut: number of clusters when cutting a hierarchical tree
    # reAssign: list of the form list("stateToChange"="stateWhoseClusterToUse")
    
    # If hierarchical clusters, cut the tree, otherwise use the output object directly
    # (the isTRUE() test must precede is.na(), since isTRUE(NA) is FALSE)
    if (isTRUE(hier)) useClusters <- cutree(clData, k=kCut)
    else if (is.na(hier)) useClusters <- clData$objCluster
    else useClusters <- clData$objCluster$cluster
        
    # If requested, manually reassign a state to the cluster of another state;
    # iterate names directly instead of seq_len(length(...)) index bookkeeping
    for (stName in names(reAssign)) {
        useClusters[stName] <- useClusters[reAssign[[stName]]]
    }
    
    # Return the clusters
    useClusters
    
}


# Function to download/load, process, segment, and analyze data for CDC daily
# Orchestrates the full pipeline: state data -> read/QC raw files -> process ->
# integrate per capita -> cluster -> assess/plot -> return a list of key objects
readRunCDCDaily <- function(thruLabel, 
                            downloadTo=list("cdcDaily"=NA, "cdcHosp"=NA), 
                            readFrom=downloadTo, 
                            compareFile=list("cdcDaily"=NA, "cdcHosp"=NA),
                            writeLog=NULL,
                            ovrwriteLog=TRUE,
                            dfPerCapita=NULL,
                            useClusters=NULL,
                            hierarchical=TRUE,
                            returnList=!isTRUE(hierarchical), 
                            kCut=6,
                            reAssignState=vector("list", 0),
                            makeCumulativePlots=TRUE,
                            skipAssessmentPlots=FALSE,
                            ...
                            ) {
    
    # FUNCTION ARGUMENTS:
    # thruLabel: the label for when the data are through (e.g., "Aug 30, 2020")
    # downloadTo: named list for locations to download data (cdcDaily and cdcHosp)
    #             NA means do not download data for that particular element
    # readFrom: named list for locations to read data from (defaults to downloadTo)
    # compareFile: named list for the reference file to be used for cdcDaily and cdcHosp 
    #              NA means do not use a reference file for that element
    # writeLog: name of a separate log file for capturing detailed data on changes between files
    #           NULL means no detailed data captured
    # ovrwriteLog: boolean, should the log file be overwritten and started again from scratch?
    # dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
    # useClusters: file containing clusters by state (NULL means make the clusters from the data)
    # hierarchical: boolean, should hierarchical clusters be produced (if FALSE, will be k-means)?
    # returnList: boolean, should a list be returned or just the cluster object?
    #             refers to what is returned by clusterStates(); the main function always returns a list
    # kCut: number of segments when cutting the hierarchical tree
    # reAssignState: mapping file for assigning a state to another state's cluster
    #                format list("stateToChange"="stateClusterToAssign")
    # makeCumulativePlots: whether to make plots of cumulative metrics
    # skipAssessmentPlots: boolean to skip the plots for assessClusters()
    #                      especially useful if just exploring dendrograms or silhouette widths
    # ...: arguments to be passed to clusterStates(), will be used only if useClusters is NULL
    # NOTE(review): earlier documentation here described dateChangePlot and dateMetricPrint
    #               arguments that are not in this signature -- presumably stale; confirm
    
    # STEP 0: Function to create the return list
    # (a closure over the function's locals; plots=FALSE is used by the
    # skipAssessmentPlots early exit, before any plot objects exist)
    createFinalList <- function(plots=TRUE) {
        list(stateData=stateData, 
             dfRaw=dfRawList, 
             dfProcess=dfProcessList, 
             dfPerCapita=dfPerCapita, 
             useClusters=useClusters, 
             plotData=if(plots) plotData else NULL, 
             consolidatedPlotData=if(plots) consolidatedPlotData else NULL, 
             clCum=if(plots) clCum else NULL
             )
    }
    
    # STEP 1: Get state data
    stateData <- getStateData()
    
    # If a log file is requested, create the log file with a UTC timestamp header
    # (initializing here allows append=TRUE for all downstream functions)
    if (!is.null(writeLog)) {
        txt <- paste0("\n\n*** Writing log at: ", Sys.time() %>% lubridate::with_tz(tzone="UTC"), "Z ***\n\n")
        capture.output(cat(txt), file=writeLog, append=isFALSE(ovrwriteLog))
    }

    # Get the data types to be used (will be the elements of readFrom) and
    # pre-allocate named list storage for the raw and processed files
    elemUsed <- names(readFrom)
    dfRawList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)
    dfProcessList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)
    
    # Steps 2-4 are run only if dfPerCapita has not been passed directly
    if (is.null(dfPerCapita)) {
        
        # Step 2: Download and QC all of the requested data
        # (an NA downloadTo entry means use the existing file rather than downloading)
        for (elem in elemUsed) {
            dfRawList[[elem]] <- readQCRawFile(fileName=readFrom[[elem]], 
                                               writeLog=writeLog, 
                                               ovrwriteLog=FALSE,
                                               urlType=elem, 
                                               getData=if(is.na(downloadTo[[elem]])) FALSE else TRUE, 
                                               dfRef=compareFile[[elem]]
                                               )
            glimpseLog(dfRawList[[elem]], txt=paste0("\nRaw file for ", elem, ":\n"), logFile=writeLog)
        }
        
        # Step 3: Process all of the requested data, using the element-specific
        # select/combine/filter mappers
        for (elem in elemUsed) {
            dfProcessList[[elem]] <- processRawFile(dfRawList[[elem]], 
                                                    vecRename=c(), # already handled in readQCRawFile()
                                                    vecSelect=vecSelectMapper[[elem]], 
                                                    lstCombo=lstComboMapper[[elem]], 
                                                    lstFilter=lstFilterMapper[[elem]]
                                                    )
            glimpseLog(dfProcessList[[elem]], txt=paste0("\nProcessed for ", elem, ":\n"), logFile=writeLog)
        }
        
        # Step 4: Integrate into a dfPerCapita file and glimpse (to specified log file)
        dfPerCapita <- createPerCapita(dfProcessList, uqBy=c("state", "date"), popData=stateData)
        glimpseLog(dfPerCapita, txt="\nIntegrated per capita data file:\n", logFile=writeLog)

    } else {
        # dfPerCapita was supplied, so no raw/processed intermediates exist
        dfRawList <- NULL
        dfProcessList <- NULL
    }
    
    # STEP 5: Create the clusters (if they have not been passed)
    if (is.null(useClusters)) {
        clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
        useClusters <- getClusters(clData, hier=hierarchical, kCut=kCut, reAssign=reAssignState)
    }
    
    # STEP 5a: Stop the process and return what is available if skipAssessmentPlots is TRUE
    if (skipAssessmentPlots) 
        return(createFinalList(plots=FALSE))
    
    # STEP 6: Create the cluster assessments
    plotData <- assessClusters(useClusters, 
                               dfState=stateData, 
                               dfBurden=dfPerCapita, 
                               thruLabel=thruLabel,
                               plotsTogether=TRUE
    )
    
    
    # STEP 7: Plot the consolidated metrics
    subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new), Tests: new tests"
    consolidatedPlotData <- plotConsolidatedMetrics(plotData, 
                                                    varMain=c("state", "cluster", "date", "pop",
                                                              "cases", "deaths", "hosp", "tests"
                                                    ), 
                                                    subT=subT, 
                                                    nrowPlot2=2
    )
    
    # STEP 8: Create cumulative metrics if requested
    if (makeCumulativePlots) {
        # Reshape to wide to compute percent positive (cases/tests), then back
        # to long, keeping only non-missing rolling values
        consPos <- consolidatedPlotData %>%
            ungroup() %>%
            select(state, cluster, date, name, vpm7) %>%
            arrange(state, cluster, date, name) %>%
            pivot_wider(-vpm7, names_from="name", values_from="vpm7") %>%
            mutate(pctpos=cases/tests) %>%
            pivot_longer(-c(state, cluster, date), values_to="vpm7") %>%
            filter(!is.na(vpm7))
        clCum <- makeCumulative(consPos)
        # Overall cumulative plot (p1 only), then one flagged plot per key metric
        plotCumulativeData(clCum, 
                           keyMetricp2="", 
                           flagsp2="", 
                           makep1=TRUE, 
                           makep2=FALSE
        )
        plotCumulativeData(clCum, 
                           keyMetricp2="deaths", 
                           flagsp2=findFlagStates(clCum, keyMetricVal = "deaths")
        )
        plotCumulativeData(clCum, 
                           keyMetricp2="cases", 
                           flagsp2=findFlagStates(clCum, keyMetricVal = "cases")
        )
        plotCumulativeData(clCum, 
                           keyMetricp2="tests", 
                           flagsp2=findFlagStates(clCum, keyMetricVal = "tests")
        )
    } else {
        clCum <- NULL
    }
    
    # STEP 9: Return a list of the key data
    return(createFinalList(plots=TRUE))
    
}

The function is tested for previously downloaded data and existing clusters, with no cluster assessment:

# Previously downloaded raw files to read (nothing is re-downloaded)
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv", 
                 "cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv"
                 )

# Run the integrated routine against the saved files, comparing to older
# reference data (with columns renamed to the current convention), reusing
# previously saved clusters, and skipping the assessment plots
cdc_daily_test <- readRunCDCDaily(thruLabel="May 2, 2021", 
                                  readFrom=readList, 
                                  compareFile=list("cdcDaily"=colRenamer(dfRaw_dc_210414,
                                                                         c('new_case'='new_cases', 
                                                                           'tot_death'='tot_deaths',
                                                                           'new_death'='new_deaths'
                                                                           )
                                                                         ), 
                                                   "cdcHosp"=dfHosp_old
                                                   ), 
                                  writeLog=NULL, 
                                  ovrwriteLog=TRUE, 
                                  dfPerCapita=NULL, 
                                  useClusters=readFromRDS("ctp_hier7_210126")$useClusters, 
                                  skipAssessmentPlots=TRUE
                                  )
## 
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv
## 
## -- Column specification --------------------------------------------------------
## cols(
##   submission_date = col_character(),
##   state = col_character(),
##   tot_cases = col_double(),
##   conf_cases = col_double(),
##   prob_cases = col_double(),
##   new_case = col_double(),
##   pnew_case = col_double(),
##   tot_death = col_double(),
##   conf_death = col_double(),
##   prob_death = col_double(),
##   new_death = col_double(),
##   pnew_death = col_double(),
##   created_at = col_character(),
##   consent_cases = col_character(),
##   consent_deaths = col_character()
## )
## 
## *** File has been checked for uniqueness by: state date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: naconf
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 18
## 
## Checking for similarity of: state
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
##          date       name newValue refValue absDelta   pctDelta
## 1  2020-01-23  tot_cases       47        1       46 1.91666667
## 2  2020-01-22  tot_cases       46        1       45 1.91489362
## 3  2020-01-25  tot_cases       51        2       49 1.84905660
## 4  2020-01-24  tot_cases       48        2       46 1.84000000
## 5  2020-01-30  tot_cases       59        5       54 1.68750000
## 6  2020-01-29  tot_cases       57        5       52 1.67741935
## 7  2020-01-28  tot_cases       56        5       51 1.67213115
## 8  2020-01-27  tot_cases       55        5       50 1.66666667
## 9  2020-01-26  tot_cases       54        5       49 1.66101695
## 10 2020-01-31  tot_cases       63        7       56 1.60000000
## 11 2020-02-02  tot_cases       67        8       59 1.57333333
## 12 2020-02-01  tot_cases       66        8       58 1.56756757
## 13 2020-02-05  tot_cases       76       11       65 1.49425287
## 14 2020-02-06  tot_cases       76       11       65 1.49425287
## 15 2020-02-04  tot_cases       75       11       64 1.48837209
## 16 2020-02-10  tot_cases       81       12       69 1.48387097
## 17 2020-02-03  tot_cases       74       11       63 1.48235294
## 18 2020-02-09  tot_cases       80       12       68 1.47826087
## 19 2020-02-15  tot_cases       91       14       77 1.46666667
## 20 2020-02-16  tot_cases       91       14       77 1.46666667
## 21 2020-02-08  tot_cases       78       12       66 1.46666667
## 22 2020-02-12  tot_cases       84       13       71 1.46391753
## 23 2020-02-07  tot_cases       77       12       65 1.46067416
## 24 2020-02-11  tot_cases       83       13       70 1.45833333
## 25 2020-02-14  tot_cases       89       14       75 1.45631068
## 26 2020-02-13  tot_cases       86       14       72 1.44000000
## 27 2020-02-17  tot_cases       97       16       81 1.43362832
## 28 2020-02-18  tot_cases       97       16       81 1.43362832
## 29 2020-02-19  tot_cases       98       17       81 1.40869565
## 30 2020-02-20  tot_cases      101       18       83 1.39495798
## 31 2020-02-21  tot_cases      111       23       88 1.31343284
## 32 2020-02-24  tot_cases      118       25       93 1.30069930
## 33 2020-02-23  tot_cases      114       25       89 1.28057554
## 34 2020-02-22  tot_cases      113       25       88 1.27536232
## 35 2020-02-25  tot_cases      123       29       94 1.23684211
## 36 2020-02-26  tot_cases      127       31       96 1.21518987
## 37 2020-02-27  tot_cases      138       35      103 1.19075145
## 38 2020-02-28  tot_cases      146       39      107 1.15675676
## 39 2020-02-29  tot_cases      165       54      111 1.01369863
## 40 2020-03-01  tot_cases      189       63      126 1.00000000
## 41 2020-03-02  tot_cases      230       95      135 0.83076923
## 42 2020-03-03  tot_cases      268      123      145 0.74168798
## 43 2020-03-04  tot_cases      306      152      154 0.67248908
## 44 2020-03-05  tot_cases      385      222      163 0.53706755
## 45 2020-03-06  tot_cases      451      279      172 0.47123288
## 46 2020-03-07  tot_cases      542      357      185 0.41156841
## 47 2020-03-08  tot_cases      722      521      201 0.32341110
## 48 2020-03-09  tot_cases      988      759      229 0.26216371
## 49 2020-03-10  tot_cases     1337     1084      253 0.20900454
## 50 2020-03-11  tot_cases     1682     1398      284 0.18441558
## 51 2020-03-14  tot_cases     3086     2606      480 0.16865777
## 52 2020-03-13  tot_cases     2586     2194      392 0.16401674
## 53 2020-03-12  tot_cases     2180     1857      323 0.16001982
## 54 2020-03-16  tot_cases     5372     4705      667 0.13238067
## 55 2020-03-15  tot_cases     4406     3872      534 0.12901667
## 56 2020-03-17  tot_cases     8424     7603      821 0.10245211
## 57 2020-03-18  tot_cases    12100    11098     1002 0.08638676
## 58 2020-03-19  tot_cases    17089    15934     1155 0.06995125
## 59 2020-03-20  tot_cases    20729    19375     1354 0.06752444
## 60 2020-03-21  tot_cases    26741    25233     1508 0.05802901
## 61 2020-09-07 new_deaths      268      232       36 0.14400000
## 62 2021-02-10 new_deaths     3193     3455      262 0.07882070
## 63 2021-02-03 new_deaths     3330     3583      253 0.07319543
## 64 2020-10-05 new_deaths      473      445       28 0.06100218
## 65 2021-01-27 new_deaths     3706     3922      216 0.05663346
## 66 2020-07-19 new_deaths      745      704       41 0.05659075
## 67 2020-09-13 new_deaths      554      524       30 0.05565863
## 68 2020-11-11 new_deaths     1013     1068       55 0.05285920
## 69 2021-02-02 new_deaths     3180     3351      171 0.05236564
## 70 2021-04-02 new_deaths      771      812       41 0.05180038
## 71 2020-10-04 new_deaths      438      416       22 0.05152225
## 72 2020-01-22  new_cases       46        1       45 1.91489362
## 73 2020-02-27  new_cases       11        4        7 0.93333333
## 74 2020-03-01  new_cases       24        9       15 0.90909091
## 75 2020-03-03  new_cases       38       28       10 0.30303030
## 76 2020-03-04  new_cases       38       29        9 0.26865672
## 77 2020-03-02  new_cases       41       32        9 0.24657534
## 78 2020-03-14  new_cases      500      412       88 0.19298246
## 79 2020-03-13  new_cases      406      337       69 0.18573351
## 80 2020-03-07  new_cases       91       78       13 0.15384615
## 81 2020-03-16  new_cases      966      833      133 0.14785992
## 82 2020-03-06  new_cases       66       57        9 0.14634146
## 83 2020-03-05  new_cases       79       70        9 0.12080537
## 84 2020-03-09  new_cases      266      238       28 0.11111111
## 85 2020-09-08  new_cases    26186    23701     2485 0.09962515
## 86 2021-04-11  new_cases    54381    49409     4972 0.09580884
## 87 2020-03-11  new_cases      345      314       31 0.09408194
## 88 2020-03-08  new_cases      180      164       16 0.09302326
## 89 2020-03-12  new_cases      498      459       39 0.08150470
## 90 2021-04-12  new_cases    57236    61526     4290 0.07224533
## 91 2020-03-10  new_cases      349      325       24 0.07121662
## 92 2020-07-06  new_cases    50177    46972     3205 0.06598112
## 93 2020-03-20  new_cases     3640     3441      199 0.05620675
## 94 2020-03-17  new_cases     3052     2898      154 0.05176471
## 95 2020-08-24  new_cases    35040    33287     1753 0.05131207
## 96 2020-09-12  new_cases    39698    41758     2060 0.05057945
## 97 2020-03-18  new_cases     3676     3495      181 0.05048110

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
##    state       name newValue refValue absDelta    pctDelta
## 1     AL tot_deaths  1784530  1415458   369072 0.230670865
## 2     AK tot_deaths    52934    42347    10587 0.222226887
## 3     SC tot_deaths  1482611  1479011     3600 0.002431100
## 4     AL  tot_cases 84345774 80518977  3826797 0.046423471
## 5     MO  tot_cases 88597603 87112443  1485160 0.016904668
## 6     AK  tot_cases  8345553  8302618    42935 0.005157924
## 7     AK new_deaths      341      310       31 0.095238095
## 8     AL new_deaths    10832    10712      120 0.011139993
## 9     SC new_deaths     9370     9279       91 0.009759236
## 10    RI new_deaths     2644     2638        6 0.002271867
## 11    AL  new_cases   522889   512950     9939 0.019190241
## 12    MO  new_cases   574786   584874    10088 0.017398203
## 13    RI  new_cases   142859   141097     1762 0.012410373
## 14    SC  new_cases   564337   563304     1033 0.001832143
## 
## 
## 
## Raw file for cdcDaily:
## Rows: 27,900
## Columns: 15
## $ date           <date> 2021-04-01, 2020-10-15, 2021-03-16, 2021-04-16, 202...
## $ state          <chr> "RI", "DC", "MN", "CT", "DC", "NJ", "NE", "CA", "VA"...
## $ tot_cases      <dbl> 138406, 16166, 498926, 329062, 0, 29, 18346, 10, 201...
## $ conf_cases     <dbl> NA, NA, NA, 302775, NA, NA, NA, NA, 183455, 28473, 0...
## $ prob_cases     <dbl> NA, NA, NA, 26287, NA, NA, NA, NA, 18506, 1282, 0, N...
## $ new_cases      <dbl> 428, 34, 708, 1062, 0, 6, 125, 0, 1161, 203, 0, 0, 2...
## $ pnew_case      <dbl> 0, 0, 152, 208, NA, NA, 0, NA, 191, 11, 0, NA, 180, ...
## $ tot_deaths     <dbl> 2627, 638, 6817, 7995, 0, 1, 260, 0, 3800, 742, 0, 0...
## $ conf_death     <dbl> NA, NA, 6380, 6578, NA, NA, NA, NA, 3527, 652, 0, NA...
## $ prob_death     <dbl> NA, NA, 437, 1417, NA, NA, NA, NA, 273, 90, 0, NA, 2...
## $ new_deaths     <dbl> 2, 0, 2, 5, 0, 0, 3, 0, 1, 3, 0, 0, 4, 23, 32, 0, 27...
## $ pnew_death     <dbl> 0, 0, 0, 1, NA, NA, 0, NA, 0, 0, 0, NA, 3, 0, 9, 0, ...
## $ created_at     <chr> "04/02/2021 01:56:15 PM", "10/16/2020 01:33:55 PM", ...
## $ consent_cases  <chr> "Not agree", NA, "N/A", "Agree", NA, "Agree", "Not a...
## $ consent_deaths <chr> "N/A", NA, "Agree", "Agree", NA, "Agree", "Not agree...
## 
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   date = col_date(format = ""),
##   geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.

## 
## *** File has been checked for uniqueness by: state date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: previous_day_admission_adult_covid_confirmed_18-19 previous_day_admission_adult_covid_confirmed_18-19_coverage previous_day_admission_adult_covid_confirmed_20-29 previous_day_admission_adult_covid_confirmed_20-29_coverage previous_day_admission_adult_covid_confirmed_30-39 previous_day_admission_adult_covid_confirmed_30-39_coverage previous_day_admission_adult_covid_confirmed_40-49 previous_day_admission_adult_covid_confirmed_40-49_coverage previous_day_admission_adult_covid_confirmed_50-59 previous_day_admission_adult_covid_confirmed_50-59_coverage previous_day_admission_adult_covid_confirmed_60-69 previous_day_admission_adult_covid_confirmed_60-69_coverage previous_day_admission_adult_covid_confirmed_70-79 previous_day_admission_adult_covid_confirmed_70-79_coverage previous_day_admission_adult_covid_confirmed_80+ previous_day_admission_adult_covid_confirmed_80+_coverage previous_day_admission_adult_covid_confirmed_unknown previous_day_admission_adult_covid_confirmed_unknown_coverage previous_day_admission_adult_covid_suspected_18-19 previous_day_admission_adult_covid_suspected_18-19_coverage previous_day_admission_adult_covid_suspected_20-29 previous_day_admission_adult_covid_suspected_20-29_coverage previous_day_admission_adult_covid_suspected_30-39 previous_day_admission_adult_covid_suspected_30-39_coverage previous_day_admission_adult_covid_suspected_40-49 previous_day_admission_adult_covid_suspected_40-49_coverage previous_day_admission_adult_covid_suspected_50-59 previous_day_admission_adult_covid_suspected_50-59_coverage previous_day_admission_adult_covid_suspected_60-69 previous_day_admission_adult_covid_suspected_60-69_coverage previous_day_admission_adult_covid_suspected_70-79 previous_day_admission_adult_covid_suspected_70-79_coverage previous_day_admission_adult_covid_suspected_80+ previous_day_admission_adult_covid_suspected_80+_coverage previous_day_admission_adult_covid_suspected_unknown 
previous_day_admission_adult_covid_suspected_unknown_coverage
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 15
## 
## Checking for similarity of: state
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
##         date       name newValue refValue absDelta   pctDelta
## 1 2021-01-14        inp   142273   133668     8605 0.06236840
## 2 2020-07-25   hosp_ped     4902     4336      566 0.12253735
## 3 2020-08-02   hosp_ped     4701     4337      364 0.08054879
## 4 2020-07-17   hosp_ped     2011     1892      119 0.06097873
## 5 2021-03-24   hosp_ped     1079     1026       53 0.05035629
## 6 2021-01-14 hosp_adult   140687   132084     8603 0.06307855

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
##    state       name newValue refValue absDelta    pctDelta
## 1     PR        inp   135016   130999     4017 0.030201304
## 2     SC        inp   368699   377405     8706 0.023337229
## 3     GA        inp  1055773  1047043     8730 0.008303152
## 4     LA        inp   364157   361954     2203 0.006067943
## 5     FL        inp  1963727  1952430    11297 0.005769432
## 6     TX        inp  2615064  2601359    13705 0.005254559
## 7     CA        inp  2841732  2829799    11933 0.004208035
## 8     IA        inp   163388   162946      442 0.002708881
## 9     NJ        inp   761481   762557     1076 0.001412038
## 10    PA        inp  1034474  1033289     1185 0.001146166
## 11    NM        inp   130285   130424      139 0.001066323
## 12    NH        inp    45742    45789       47 0.001026974
## 13    PR   hosp_ped     9420     7358     2062 0.245798069
## 14    NH   hosp_ped      220      263       43 0.178053830
## 15    SC   hosp_ped     2563     3000      437 0.157109473
## 16    ID   hosp_ped     1089     1025       64 0.060548723
## 17    TN   hosp_ped     7385     7027      358 0.049680822
## 18    KS   hosp_ped     1493     1424       69 0.047308879
## 19    FL   hosp_ped    52139    50218     1921 0.037535293
## 20    NV   hosp_ped     1695     1748       53 0.030787104
## 21    AL   hosp_ped     6322     6512      190 0.029608851
## 22    WV   hosp_ped     1827     1794       33 0.018227009
## 23    AR   hosp_ped     5537     5441       96 0.017489525
## 24    AZ   hosp_ped    10672    10503      169 0.015962220
## 25    SD   hosp_ped     2290     2318       28 0.012152778
## 26    VA   hosp_ped     5760     5826       66 0.011393061
## 27    UT   hosp_ped     2880     2848       32 0.011173184
## 28    PA   hosp_ped    16945    16759      186 0.011037266
## 29    MS   hosp_ped     2956     2987       31 0.010432442
## 30    NC   hosp_ped     8953     8866       87 0.009764858
## 31    MD   hosp_ped     4140     4106       34 0.008246423
## 32    IA   hosp_ped     1848     1836       12 0.006514658
## 33    NM   hosp_ped     2636     2621       15 0.005706677
## 34    ME   hosp_ped      433      431        2 0.004629630
## 35    MA   hosp_ped     4613     4593       20 0.004344992
## 36    KY   hosp_ped     4669     4689       20 0.004274418
## 37    OK   hosp_ped     9707     9667       40 0.004129245
## 38    CO   hosp_ped     7961     7929       32 0.004027690
## 39    HI   hosp_ped      537      539        2 0.003717472
## 40    OH   hosp_ped    17830    17895       65 0.003638908
## 41    LA   hosp_ped     2755     2745       10 0.003636364
## 42    NJ   hosp_ped     8196     8225       29 0.003532063
## 43    CA   hosp_ped    27980    27908       72 0.002576582
## 44    CT   hosp_ped     2046     2041        5 0.002446782
## 45    GA   hosp_ped    19391    19436       45 0.002317975
## 46    MN   hosp_ped     4967     4976        9 0.001810319
## 47    AK   hosp_ped      735      736        1 0.001359619
## 48    MT   hosp_ped      778      777        1 0.001286174
## 49    SC hosp_adult   301602   309856     8254 0.026997766
## 50    PR hosp_adult   104384   102818     1566 0.015115684
## 51    GA hosp_adult   750223   741448     8775 0.011765329
## 52    LA hosp_adult   287590   285397     2193 0.007654624
## 53    TX hosp_adult  2136776  2123121    13655 0.006410953
## 54    FL hosp_adult  1430350  1421859     8491 0.005953982
## 55    CA hosp_adult  2320857  2309180    11677 0.005044020
## 56    IA hosp_adult   135268   134838      430 0.003183935
## 57    NJ hosp_adult   546309   547336     1027 0.001878123
## 58    NM hosp_adult   104180   104333      153 0.001467534
## 59    ME hosp_adult    29610    29648       38 0.001282527
## 60    PA hosp_adult   818062   817063      999 0.001221925
## 61    DE hosp_adult    60210    60279       69 0.001145333
## 62    KY hosp_adult   276785   276488      297 0.001073611
## 63    RI hosp_adult    63607    63539       68 0.001069636
## 
## 
## 
## Raw file for cdcHosp:
## Rows: 23,230
## Columns: 97
## $ state                                                                        <chr> ...
## $ date                                                                         <date> ...
## $ critical_staffing_shortage_today_yes                                         <dbl> ...
## $ critical_staffing_shortage_today_no                                          <dbl> ...
## $ critical_staffing_shortage_today_not_reported                                <dbl> ...
## $ critical_staffing_shortage_anticipated_within_week_yes                       <dbl> ...
## $ critical_staffing_shortage_anticipated_within_week_no                        <dbl> ...
## $ critical_staffing_shortage_anticipated_within_week_not_reported              <dbl> ...
## $ hospital_onset_covid                                                         <dbl> ...
## $ hospital_onset_covid_coverage                                                <dbl> ...
## $ inpatient_beds                                                               <dbl> ...
## $ inpatient_beds_coverage                                                      <dbl> ...
## $ inpatient_beds_used                                                          <dbl> ...
## $ inpatient_beds_used_coverage                                                 <dbl> ...
## $ inp                                                                          <dbl> ...
## $ inpatient_beds_used_covid_coverage                                           <dbl> ...
## $ previous_day_admission_adult_covid_confirmed                                 <dbl> ...
## $ previous_day_admission_adult_covid_confirmed_coverage                        <dbl> ...
## $ previous_day_admission_adult_covid_suspected                                 <dbl> ...
## $ previous_day_admission_adult_covid_suspected_coverage                        <dbl> ...
## $ previous_day_admission_pediatric_covid_confirmed                             <dbl> ...
## $ previous_day_admission_pediatric_covid_confirmed_coverage                    <dbl> ...
## $ previous_day_admission_pediatric_covid_suspected                             <dbl> ...
## $ previous_day_admission_pediatric_covid_suspected_coverage                    <dbl> ...
## $ staffed_adult_icu_bed_occupancy                                              <dbl> ...
## $ staffed_adult_icu_bed_occupancy_coverage                                     <dbl> ...
## $ staffed_icu_adult_patients_confirmed_and_suspected_covid                     <dbl> ...
## $ staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage            <dbl> ...
## $ staffed_icu_adult_patients_confirmed_covid                                   <dbl> ...
## $ staffed_icu_adult_patients_confirmed_covid_coverage                          <dbl> ...
## $ hosp_adult                                                                   <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage     <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_covid                            <dbl> ...
## $ total_adult_patients_hospitalized_confirmed_covid_coverage                   <dbl> ...
## $ hosp_ped                                                                     <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_covid                        <dbl> ...
## $ total_pediatric_patients_hospitalized_confirmed_covid_coverage               <dbl> ...
## $ total_staffed_adult_icu_beds                                                 <dbl> ...
## $ total_staffed_adult_icu_beds_coverage                                        <dbl> ...
## $ inpatient_beds_utilization                                                   <dbl> ...
## $ inpatient_beds_utilization_coverage                                          <dbl> ...
## $ inpatient_beds_utilization_numerator                                         <dbl> ...
## $ inpatient_beds_utilization_denominator                                       <dbl> ...
## $ percent_of_inpatients_with_covid                                             <dbl> ...
## $ percent_of_inpatients_with_covid_coverage                                    <dbl> ...
## $ percent_of_inpatients_with_covid_numerator                                   <dbl> ...
## $ percent_of_inpatients_with_covid_denominator                                 <dbl> ...
## $ inpatient_bed_covid_utilization                                              <dbl> ...
## $ inpatient_bed_covid_utilization_coverage                                     <dbl> ...
## $ inpatient_bed_covid_utilization_numerator                                    <dbl> ...
## $ inpatient_bed_covid_utilization_denominator                                  <dbl> ...
## $ adult_icu_bed_covid_utilization                                              <dbl> ...
## $ adult_icu_bed_covid_utilization_coverage                                     <dbl> ...
## $ adult_icu_bed_covid_utilization_numerator                                    <dbl> ...
## $ adult_icu_bed_covid_utilization_denominator                                  <dbl> ...
## $ adult_icu_bed_utilization                                                    <dbl> ...
## $ adult_icu_bed_utilization_coverage                                           <dbl> ...
## $ adult_icu_bed_utilization_numerator                                          <dbl> ...
## $ adult_icu_bed_utilization_denominator                                        <dbl> ...
## $ geocoded_state                                                               <chr> ...
## $ `previous_day_admission_adult_covid_confirmed_18-19`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_18-19_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_20-29`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_20-29_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_30-39`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_30-39_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_40-49`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_40-49_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_50-59`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_50-59_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_60-69`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_60-69_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_70-79`                         <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_70-79_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_80+`                           <dbl> ...
## $ `previous_day_admission_adult_covid_confirmed_80+_coverage`                  <dbl> ...
## $ previous_day_admission_adult_covid_confirmed_unknown                         <dbl> ...
## $ previous_day_admission_adult_covid_confirmed_unknown_coverage                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_18-19`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_18-19_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_20-29`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_20-29_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_30-39`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_30-39_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_40-49`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_40-49_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_50-59`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_50-59_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_60-69`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_60-69_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_70-79`                         <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_70-79_coverage`                <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_80+`                           <dbl> ...
## $ `previous_day_admission_adult_covid_suspected_80+_coverage`                  <dbl> ...
## $ previous_day_admission_adult_covid_suspected_unknown                         <dbl> ...
## $ previous_day_admission_adult_covid_suspected_unknown_coverage                <dbl> ...
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
##   isType tot_cases tot_deaths new_cases   new_deaths         n
##   <chr>      <dbl>      <dbl>     <dbl>        <dbl>     <dbl>
## 1 before   5.08e+9    1.07e+8   3.21e+7 558830       27435    
## 2 after    5.06e+9    1.06e+8   3.19e+7 556355       23715    
## 3 pctchg   4.40e-3    3.81e-3   4.47e-3      0.00443     0.136
## 
## 
## Processed for cdcDaily:
## Rows: 23,715
## Columns: 6
## $ date       <date> 2021-04-01, 2020-10-15, 2021-03-16, 2021-04-16, 2020-02...
## $ state      <chr> "RI", "DC", "MN", "CT", "DC", "NJ", "NE", "CA", "VA", "D...
## $ tot_cases  <dbl> 138406, 16166, 498926, 329062, 0, 29, 18346, 10, 201961,...
## $ tot_deaths <dbl> 2627, 638, 6817, 7995, 0, 1, 260, 0, 3800, 742, 0, 7055,...
## $ new_cases  <dbl> 428, 34, 708, 1062, 0, 6, 125, 0, 1161, 203, 0, 290, 252...
## $ new_deaths <dbl> 2, 0, 2, 5, 0, 0, 3, 0, 1, 3, 0, 4, 23, 32, 27, 0, 2, 21...
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
##   isType     inp hosp_adult    hosp_ped          n
##   <chr>    <dbl>      <dbl>       <dbl>      <dbl>
## 1 before 2.57e+7    1.99e+7 436353      23230     
## 2 after  2.56e+7    1.98e+7 426239      22395     
## 3 pctchg 5.60e-3    5.66e-3      0.0232     0.0359
## 
## 
## Processed for cdcHosp:
## Rows: 22,395
## Columns: 5
## $ date       <date> 2020-07-27, 2020-07-26, 2020-07-25, 2020-07-24, 2020-07...
## $ state      <chr> "ND", "ND", "ND", "NE", "ND", "HI", "ME", "MD", "OR", "N...
## $ inp        <dbl> 59, 45, 43, 219, 46, 20, 67, 1179, 149, 94, 3061, 95, 20...
## $ hosp_adult <dbl> NA, NA, NA, 202, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ hosp_ped   <dbl> NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## 
## Integrated per capita data file:
## Rows: 24,285
## Columns: 23
## $ date       <date> 2020-01-01, 2020-01-01, 2020-01-01, 2020-01-01, 2020-01...
## $ state      <chr> "AL", "HI", "IN", "LA", "MN", "MT", "NC", "TX", "AL", "H...
## $ tot_cases  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tot_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ new_cases  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ new_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ inp        <dbl> NA, 0, 0, NA, 0, 0, 0, 0, NA, 0, 0, NA, 0, 0, 0, 1877, 0...
## $ hosp_adult <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hosp_ped   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tcpm       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tdpm       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ cpm        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ dpm        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hpm        <dbl> NA, 0.0000, 0.0000, NA, 0.0000, 0.0000, 0.0000, 0.0000, ...
## $ ahpm       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ phpm       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tcpm7      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tdpm7      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ cpm7       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ dpm7       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hpm7       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ahpm7      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ phpm7      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# Print the full list returned by readRunCDCDaily: state population data, raw and
# processed data frames, per capita data, cluster assignments, and plot slots
cdc_daily_test
## $stateData
## # A tibble: 51 x 3
##    state name                      pop
##    <chr> <chr>                   <dbl>
##  1 AL    Alabama               4903185
##  2 AK    Alaska                 731545
##  3 AZ    Arizona               7278717
##  4 AR    Arkansas              3017804
##  5 CA    California           39512223
##  6 CO    Colorado              5758736
##  7 CT    Connecticut           3565287
##  8 DE    Delaware               973764
##  9 DC    District of Columbia   705749
## 10 FL    Florida              21477737
## # ... with 41 more rows
## 
## $dfRaw
## $dfRaw$cdcDaily
## # A tibble: 27,900 x 15
##    date       state tot_cases conf_cases prob_cases new_cases pnew_case
##    <date>     <chr>     <dbl>      <dbl>      <dbl>     <dbl>     <dbl>
##  1 2021-04-01 RI       138406         NA         NA       428         0
##  2 2020-10-15 DC        16166         NA         NA        34         0
##  3 2021-03-16 MN       498926         NA         NA       708       152
##  4 2021-04-16 CT       329062     302775      26287      1062       208
##  5 2020-02-14 DC            0         NA         NA         0        NA
##  6 2020-03-12 NJ           29         NA         NA         6        NA
##  7 2020-06-25 NE        18346         NA         NA       125         0
##  8 2020-02-24 CA           10         NA         NA         0        NA
##  9 2020-11-14 VA       201961     183455      18506      1161       191
## 10 2020-11-17 DE        29755      28473       1282       203        11
## # ... with 27,890 more rows, and 8 more variables: tot_deaths <dbl>,
## #   conf_death <dbl>, prob_death <dbl>, new_deaths <dbl>, pnew_death <dbl>,
## #   created_at <chr>, consent_cases <chr>, consent_deaths <chr>
## 
## $dfRaw$cdcHosp
## # A tibble: 23,230 x 97
##    state date       critical_staffi~ critical_staffi~ critical_staffi~
##    <chr> <date>                <dbl>            <dbl>            <dbl>
##  1 ND    2020-07-27                0                0               52
##  2 ND    2020-07-26                0                0               52
##  3 ND    2020-07-25                0                0               52
##  4 NE    2020-07-24                3               30               65
##  5 ND    2020-07-21                0                0               52
##  6 VI    2020-07-10                0                0                2
##  7 HI    2020-07-09                0                0               30
##  8 VI    2020-07-03                0                0                2
##  9 ME    2020-06-24                0                0               36
## 10 MD    2020-06-12                0                0               56
## # ... with 23,220 more rows, and 92 more variables:
## #   critical_staffing_shortage_anticipated_within_week_yes <dbl>,
## #   critical_staffing_shortage_anticipated_within_week_no <dbl>,
## #   critical_staffing_shortage_anticipated_within_week_not_reported <dbl>,
## #   hospital_onset_covid <dbl>, hospital_onset_covid_coverage <dbl>,
## #   inpatient_beds <dbl>, inpatient_beds_coverage <dbl>,
## #   inpatient_beds_used <dbl>, inpatient_beds_used_coverage <dbl>, inp <dbl>,
## #   inpatient_beds_used_covid_coverage <dbl>,
## #   previous_day_admission_adult_covid_confirmed <dbl>,
## #   previous_day_admission_adult_covid_confirmed_coverage <dbl>,
## #   previous_day_admission_adult_covid_suspected <dbl>,
## #   previous_day_admission_adult_covid_suspected_coverage <dbl>,
## #   previous_day_admission_pediatric_covid_confirmed <dbl>,
## #   previous_day_admission_pediatric_covid_confirmed_coverage <dbl>,
## #   previous_day_admission_pediatric_covid_suspected <dbl>,
## #   previous_day_admission_pediatric_covid_suspected_coverage <dbl>,
## #   staffed_adult_icu_bed_occupancy <dbl>,
## #   staffed_adult_icu_bed_occupancy_coverage <dbl>,
## #   staffed_icu_adult_patients_confirmed_and_suspected_covid <dbl>,
## #   staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage <dbl>,
## #   staffed_icu_adult_patients_confirmed_covid <dbl>,
## #   staffed_icu_adult_patients_confirmed_covid_coverage <dbl>,
## #   hosp_adult <dbl>,
## #   total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl>,
## #   total_adult_patients_hospitalized_confirmed_covid <dbl>,
## #   total_adult_patients_hospitalized_confirmed_covid_coverage <dbl>,
## #   hosp_ped <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_covid <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_covid_coverage <dbl>,
## #   total_staffed_adult_icu_beds <dbl>,
## #   total_staffed_adult_icu_beds_coverage <dbl>,
## #   inpatient_beds_utilization <dbl>,
## #   inpatient_beds_utilization_coverage <dbl>,
## #   inpatient_beds_utilization_numerator <dbl>,
## #   inpatient_beds_utilization_denominator <dbl>,
## #   percent_of_inpatients_with_covid <dbl>,
## #   percent_of_inpatients_with_covid_coverage <dbl>,
## #   percent_of_inpatients_with_covid_numerator <dbl>,
## #   percent_of_inpatients_with_covid_denominator <dbl>,
## #   inpatient_bed_covid_utilization <dbl>,
## #   inpatient_bed_covid_utilization_coverage <dbl>,
## #   inpatient_bed_covid_utilization_numerator <dbl>,
## #   inpatient_bed_covid_utilization_denominator <dbl>,
## #   adult_icu_bed_covid_utilization <dbl>,
## #   adult_icu_bed_covid_utilization_coverage <dbl>,
## #   adult_icu_bed_covid_utilization_numerator <dbl>,
## #   adult_icu_bed_covid_utilization_denominator <dbl>,
## #   adult_icu_bed_utilization <dbl>, adult_icu_bed_utilization_coverage <dbl>,
## #   adult_icu_bed_utilization_numerator <dbl>,
## #   adult_icu_bed_utilization_denominator <dbl>, geocoded_state <chr>,
## #   `previous_day_admission_adult_covid_confirmed_18-19` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_18-19_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_20-29` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_20-29_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_30-39` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_30-39_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_40-49` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_40-49_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_50-59` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_50-59_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_60-69` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_60-69_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_70-79` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_70-79_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_80+` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_80+_coverage` <dbl>,
## #   previous_day_admission_adult_covid_confirmed_unknown <dbl>,
## #   previous_day_admission_adult_covid_confirmed_unknown_coverage <dbl>,
## #   `previous_day_admission_adult_covid_suspected_18-19` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_18-19_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_20-29` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_20-29_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_30-39` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_30-39_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_40-49` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_40-49_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_50-59` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_50-59_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_60-69` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_60-69_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_70-79` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_70-79_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_80+` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_80+_coverage` <dbl>,
## #   previous_day_admission_adult_covid_suspected_unknown <dbl>,
## #   previous_day_admission_adult_covid_suspected_unknown_coverage <dbl>
## 
## 
## $dfProcess
## $dfProcess$cdcDaily
## # A tibble: 23,715 x 6
##    date       state tot_cases tot_deaths new_cases new_deaths
##    <date>     <chr>     <dbl>      <dbl>     <dbl>      <dbl>
##  1 2021-04-01 RI       138406       2627       428          2
##  2 2020-10-15 DC        16166        638        34          0
##  3 2021-03-16 MN       498926       6817       708          2
##  4 2021-04-16 CT       329062       7995      1062          5
##  5 2020-02-14 DC            0          0         0          0
##  6 2020-03-12 NJ           29          1         6          0
##  7 2020-06-25 NE        18346        260       125          3
##  8 2020-02-24 CA           10          0         0          0
##  9 2020-11-14 VA       201961       3800      1161          1
## 10 2020-11-17 DE        29755        742       203          3
## # ... with 23,705 more rows
## 
## $dfProcess$cdcHosp
## # A tibble: 22,395 x 5
##    date       state   inp hosp_adult hosp_ped
##    <date>     <chr> <dbl>      <dbl>    <dbl>
##  1 2020-07-27 ND       59         NA       NA
##  2 2020-07-26 ND       45         NA       NA
##  3 2020-07-25 ND       43         NA       NA
##  4 2020-07-24 NE      219        202        0
##  5 2020-07-21 ND       46         NA       NA
##  6 2020-07-09 HI       20         NA       NA
##  7 2020-06-24 ME       67         NA       NA
##  8 2020-06-12 MD     1179         NA       NA
##  9 2020-06-10 OR      149         NA       NA
## 10 2020-06-08 NH       94         NA       NA
## # ... with 22,385 more rows
## 
## 
## $dfPerCapita
## # A tibble: 24,285 x 23
##    date       state tot_cases tot_deaths new_cases new_deaths   inp hosp_adult
##    <date>     <chr>     <dbl>      <dbl>     <dbl>      <dbl> <dbl>      <dbl>
##  1 2020-01-01 AL           NA         NA        NA         NA    NA         NA
##  2 2020-01-01 HI           NA         NA        NA         NA     0         NA
##  3 2020-01-01 IN           NA         NA        NA         NA     0         NA
##  4 2020-01-01 LA           NA         NA        NA         NA    NA         NA
##  5 2020-01-01 MN           NA         NA        NA         NA     0         NA
##  6 2020-01-01 MT           NA         NA        NA         NA     0         NA
##  7 2020-01-01 NC           NA         NA        NA         NA     0         NA
##  8 2020-01-01 TX           NA         NA        NA         NA     0         NA
##  9 2020-01-02 AL           NA         NA        NA         NA    NA         NA
## 10 2020-01-02 HI           NA         NA        NA         NA     0         NA
## # ... with 24,275 more rows, and 15 more variables: hosp_ped <dbl>, tcpm <dbl>,
## #   tdpm <dbl>, cpm <dbl>, dpm <dbl>, hpm <dbl>, ahpm <dbl>, phpm <dbl>,
## #   tcpm7 <dbl>, tdpm7 <dbl>, cpm7 <dbl>, dpm7 <dbl>, hpm7 <dbl>, ahpm7 <dbl>,
## #   phpm7 <dbl>
## 
## $useClusters
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS 
##  1  2  2  2  3  1  4  5  5  2  2  7  3  1  3  3  1  1  5  4  5  7  5  1  1  2 
## MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 
##  1  3  8  1  3  4  3  2  4  1  1  1  3  5  2  8  2  2  1  3  7  3  1  1  1 
## 
## $plotData
## NULL
## 
## $consolidatedPlotData
## NULL
## 
## $clCum
## NULL
# Regression check: the function's per capita output must exactly match the
# previously validated reference object (expected result: TRUE)
identical(cdc_daily_test$dfPerCapita, dfPerCap_210502_v2)
## [1] TRUE

The function appears to be working as intended. Next steps are to improve processing of the external log file and to update the assessment capability.

The updated function is used to create clusters from the integrated data, with detailed comparison output written to an external log file:

# Local CDC extracts to read in place of fresh downloads:
# daily case/death file and hospitalization file
readList <- list(
  cdcDaily = "./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv",
  cdcHosp  = "./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv"
)

# Re-run the CDC daily pipeline for data labeled through May 2, 2021, comparing
# current extracts against prior reference data and routing detailed difference
# output to an external log file instead of the console.
cdc_daily_test_v2 <- readRunCDCDaily(thruLabel="May 2, 2021", 
                                     # Read from the local files listed above rather than downloading
                                     readFrom=readList, 
                                     # Reference data for the similarity checks; the older daily
                                     # extract uses the pre-rename CDC column names, so map them
                                     # to the current names (new_cases/tot_deaths/new_deaths)
                                     # before comparing
                                     compareFile=list("cdcDaily"=colRenamer(dfRaw_dc_210414,
                                                                            c('new_case'='new_cases', 
                                                                              'tot_death'='tot_deaths',
                                                                              'new_death'='new_deaths'
                                                                              )
                                                                            ), 
                                                      "cdcHosp"=dfHosp_old
                                                      ), 
                                     # Write detailed comparison output to this log file,
                                     # overwriting any existing copy
                                     writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log", 
                                     ovrwriteLog=TRUE, 
                                     # NULL here so per capita data and clusters are rebuilt
                                     # from scratch rather than supplied externally
                                     dfPerCapita=NULL, 
                                     useClusters=NULL, 
                                     hierarchical=NA, 
                                     returnList=TRUE,
                                     # Shape the clustering input with custom time buckets
                                     # spanning 2020-04 through 2021-03 (bucket semantics
                                     # defined by customTimeBucket -- confirm there)
                                     shapeFunc=customTimeBucket, 
                                     minShape="2020-04", 
                                     maxShape="2021-03", 
                                     # Segment counts for clustering -- presumably
                                     # high/medium/low and early/steady/late splits; verify
                                     # against readRunCDCDaily's definition
                                     hmlSegs=3, 
                                     eslSegs=3, 
                                     # NOTE(review): appears to assign VT to NH's cluster --
                                     # confirm direction of the mapping
                                     reAssignState=list("VT"="NH"),
                                     # Fixed seed for reproducible cluster assignments
                                     seed=2104221520,
                                     # Suppress assessment plots for this validation run
                                     skipAssessmentPlots=TRUE
                                     )
## 
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv
## 
## -- Column specification --------------------------------------------------------
## cols(
##   submission_date = col_character(),
##   state = col_character(),
##   tot_cases = col_double(),
##   conf_cases = col_double(),
##   prob_cases = col_double(),
##   new_case = col_double(),
##   pnew_case = col_double(),
##   tot_death = col_double(),
##   conf_death = col_double(),
##   prob_death = col_double(),
##   new_death = col_double(),
##   pnew_death = col_double(),
##   created_at = col_character(),
##   consent_cases = col_character(),
##   consent_deaths = col_character()
## )
## 
## *** File has been checked for uniqueness by: state date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: naconf
## In current but not in reference: 
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 18
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
## 
## Checking for similarity of: state
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 97 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
## 
## 
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   date = col_date(format = ""),
##   geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.

## 
## *** File has been checked for uniqueness by: state date

## 
## 
## Checking for similarity of: column names
## In reference but not in current: 
## In current but not in reference: previous_day_admission_adult_covid_confirmed_18-19 previous_day_admission_adult_covid_confirmed_18-19_coverage previous_day_admission_adult_covid_confirmed_20-29 previous_day_admission_adult_covid_confirmed_20-29_coverage previous_day_admission_adult_covid_confirmed_30-39 previous_day_admission_adult_covid_confirmed_30-39_coverage previous_day_admission_adult_covid_confirmed_40-49 previous_day_admission_adult_covid_confirmed_40-49_coverage previous_day_admission_adult_covid_confirmed_50-59 previous_day_admission_adult_covid_confirmed_50-59_coverage previous_day_admission_adult_covid_confirmed_60-69 previous_day_admission_adult_covid_confirmed_60-69_coverage previous_day_admission_adult_covid_confirmed_70-79 previous_day_admission_adult_covid_confirmed_70-79_coverage previous_day_admission_adult_covid_confirmed_80+ previous_day_admission_adult_covid_confirmed_80+_coverage previous_day_admission_adult_covid_confirmed_unknown previous_day_admission_adult_covid_confirmed_unknown_coverage previous_day_admission_adult_covid_suspected_18-19 previous_day_admission_adult_covid_suspected_18-19_coverage previous_day_admission_adult_covid_suspected_20-29 previous_day_admission_adult_covid_suspected_20-29_coverage previous_day_admission_adult_covid_suspected_30-39 previous_day_admission_adult_covid_suspected_30-39_coverage previous_day_admission_adult_covid_suspected_40-49 previous_day_admission_adult_covid_suspected_40-49_coverage previous_day_admission_adult_covid_suspected_50-59 previous_day_admission_adult_covid_suspected_50-59_coverage previous_day_admission_adult_covid_suspected_60-69 previous_day_admission_adult_covid_suspected_60-69_coverage previous_day_admission_adult_covid_suspected_70-79 previous_day_admission_adult_covid_suspected_70-79_coverage previous_day_admission_adult_covid_suspected_80+ previous_day_admission_adult_covid_suspected_80+_coverage previous_day_admission_adult_covid_suspected_unknown 
previous_day_admission_adult_covid_suspected_unknown_coverage
## 
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 15
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
## 
## Checking for similarity of: state
## In reference but not in current: 
## In current but not in reference:

## 
## 
## ***Differences of at least 5 and at least 5%
## 
## 6 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log

## 
## 
## ***Differences of at least 0 and at least 0.1%
## 
## 63 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v001.log
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
##   isType tot_cases tot_deaths new_cases   new_deaths         n
##   <chr>      <dbl>      <dbl>     <dbl>        <dbl>     <dbl>
## 1 before   5.08e+9    1.07e+8   3.21e+7 558830       27435    
## 2 after    5.06e+9    1.06e+8   3.19e+7 556355       23715    
## 3 pctchg   4.40e-3    3.81e-3   4.47e-3      0.00443     0.136
## 
## 
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
##   isType     inp hosp_adult    hosp_ped          n
##   <chr>    <dbl>      <dbl>       <dbl>      <dbl>
## 1 before 2.57e+7    1.99e+7 436353      23230     
## 2 after  2.56e+7    1.98e+7 426239      22395     
## 3 pctchg 5.60e-3    5.66e-3      0.0232     0.0359
cdc_daily_test_v2
## $stateData
## # A tibble: 51 x 3
##    state name                      pop
##    <chr> <chr>                   <dbl>
##  1 AL    Alabama               4903185
##  2 AK    Alaska                 731545
##  3 AZ    Arizona               7278717
##  4 AR    Arkansas              3017804
##  5 CA    California           39512223
##  6 CO    Colorado              5758736
##  7 CT    Connecticut           3565287
##  8 DE    Delaware               973764
##  9 DC    District of Columbia   705749
## 10 FL    Florida              21477737
## # ... with 41 more rows
## 
## $dfRaw
## $dfRaw$cdcDaily
## # A tibble: 27,900 x 15
##    date       state tot_cases conf_cases prob_cases new_cases pnew_case
##    <date>     <chr>     <dbl>      <dbl>      <dbl>     <dbl>     <dbl>
##  1 2021-04-01 RI       138406         NA         NA       428         0
##  2 2020-10-15 DC        16166         NA         NA        34         0
##  3 2021-03-16 MN       498926         NA         NA       708       152
##  4 2021-04-16 CT       329062     302775      26287      1062       208
##  5 2020-02-14 DC            0         NA         NA         0        NA
##  6 2020-03-12 NJ           29         NA         NA         6        NA
##  7 2020-06-25 NE        18346         NA         NA       125         0
##  8 2020-02-24 CA           10         NA         NA         0        NA
##  9 2020-11-14 VA       201961     183455      18506      1161       191
## 10 2020-11-17 DE        29755      28473       1282       203        11
## # ... with 27,890 more rows, and 8 more variables: tot_deaths <dbl>,
## #   conf_death <dbl>, prob_death <dbl>, new_deaths <dbl>, pnew_death <dbl>,
## #   created_at <chr>, consent_cases <chr>, consent_deaths <chr>
## 
## $dfRaw$cdcHosp
## # A tibble: 23,230 x 97
##    state date       critical_staffi~ critical_staffi~ critical_staffi~
##    <chr> <date>                <dbl>            <dbl>            <dbl>
##  1 ND    2020-07-27                0                0               52
##  2 ND    2020-07-26                0                0               52
##  3 ND    2020-07-25                0                0               52
##  4 NE    2020-07-24                3               30               65
##  5 ND    2020-07-21                0                0               52
##  6 VI    2020-07-10                0                0                2
##  7 HI    2020-07-09                0                0               30
##  8 VI    2020-07-03                0                0                2
##  9 ME    2020-06-24                0                0               36
## 10 MD    2020-06-12                0                0               56
## # ... with 23,220 more rows, and 92 more variables:
## #   critical_staffing_shortage_anticipated_within_week_yes <dbl>,
## #   critical_staffing_shortage_anticipated_within_week_no <dbl>,
## #   critical_staffing_shortage_anticipated_within_week_not_reported <dbl>,
## #   hospital_onset_covid <dbl>, hospital_onset_covid_coverage <dbl>,
## #   inpatient_beds <dbl>, inpatient_beds_coverage <dbl>,
## #   inpatient_beds_used <dbl>, inpatient_beds_used_coverage <dbl>, inp <dbl>,
## #   inpatient_beds_used_covid_coverage <dbl>,
## #   previous_day_admission_adult_covid_confirmed <dbl>,
## #   previous_day_admission_adult_covid_confirmed_coverage <dbl>,
## #   previous_day_admission_adult_covid_suspected <dbl>,
## #   previous_day_admission_adult_covid_suspected_coverage <dbl>,
## #   previous_day_admission_pediatric_covid_confirmed <dbl>,
## #   previous_day_admission_pediatric_covid_confirmed_coverage <dbl>,
## #   previous_day_admission_pediatric_covid_suspected <dbl>,
## #   previous_day_admission_pediatric_covid_suspected_coverage <dbl>,
## #   staffed_adult_icu_bed_occupancy <dbl>,
## #   staffed_adult_icu_bed_occupancy_coverage <dbl>,
## #   staffed_icu_adult_patients_confirmed_and_suspected_covid <dbl>,
## #   staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage <dbl>,
## #   staffed_icu_adult_patients_confirmed_covid <dbl>,
## #   staffed_icu_adult_patients_confirmed_covid_coverage <dbl>,
## #   hosp_adult <dbl>,
## #   total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl>,
## #   total_adult_patients_hospitalized_confirmed_covid <dbl>,
## #   total_adult_patients_hospitalized_confirmed_covid_coverage <dbl>,
## #   hosp_ped <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_covid <dbl>,
## #   total_pediatric_patients_hospitalized_confirmed_covid_coverage <dbl>,
## #   total_staffed_adult_icu_beds <dbl>,
## #   total_staffed_adult_icu_beds_coverage <dbl>,
## #   inpatient_beds_utilization <dbl>,
## #   inpatient_beds_utilization_coverage <dbl>,
## #   inpatient_beds_utilization_numerator <dbl>,
## #   inpatient_beds_utilization_denominator <dbl>,
## #   percent_of_inpatients_with_covid <dbl>,
## #   percent_of_inpatients_with_covid_coverage <dbl>,
## #   percent_of_inpatients_with_covid_numerator <dbl>,
## #   percent_of_inpatients_with_covid_denominator <dbl>,
## #   inpatient_bed_covid_utilization <dbl>,
## #   inpatient_bed_covid_utilization_coverage <dbl>,
## #   inpatient_bed_covid_utilization_numerator <dbl>,
## #   inpatient_bed_covid_utilization_denominator <dbl>,
## #   adult_icu_bed_covid_utilization <dbl>,
## #   adult_icu_bed_covid_utilization_coverage <dbl>,
## #   adult_icu_bed_covid_utilization_numerator <dbl>,
## #   adult_icu_bed_covid_utilization_denominator <dbl>,
## #   adult_icu_bed_utilization <dbl>, adult_icu_bed_utilization_coverage <dbl>,
## #   adult_icu_bed_utilization_numerator <dbl>,
## #   adult_icu_bed_utilization_denominator <dbl>, geocoded_state <chr>,
## #   `previous_day_admission_adult_covid_confirmed_18-19` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_18-19_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_20-29` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_20-29_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_30-39` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_30-39_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_40-49` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_40-49_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_50-59` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_50-59_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_60-69` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_60-69_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_70-79` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_70-79_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_80+` <dbl>,
## #   `previous_day_admission_adult_covid_confirmed_80+_coverage` <dbl>,
## #   previous_day_admission_adult_covid_confirmed_unknown <dbl>,
## #   previous_day_admission_adult_covid_confirmed_unknown_coverage <dbl>,
## #   `previous_day_admission_adult_covid_suspected_18-19` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_18-19_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_20-29` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_20-29_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_30-39` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_30-39_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_40-49` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_40-49_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_50-59` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_50-59_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_60-69` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_60-69_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_70-79` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_70-79_coverage` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_80+` <dbl>,
## #   `previous_day_admission_adult_covid_suspected_80+_coverage` <dbl>,
## #   previous_day_admission_adult_covid_suspected_unknown <dbl>,
## #   previous_day_admission_adult_covid_suspected_unknown_coverage <dbl>
## 
## 
## $dfProcess
## $dfProcess$cdcDaily
## # A tibble: 23,715 x 6
##    date       state tot_cases tot_deaths new_cases new_deaths
##    <date>     <chr>     <dbl>      <dbl>     <dbl>      <dbl>
##  1 2021-04-01 RI       138406       2627       428          2
##  2 2020-10-15 DC        16166        638        34          0
##  3 2021-03-16 MN       498926       6817       708          2
##  4 2021-04-16 CT       329062       7995      1062          5
##  5 2020-02-14 DC            0          0         0          0
##  6 2020-03-12 NJ           29          1         6          0
##  7 2020-06-25 NE        18346        260       125          3
##  8 2020-02-24 CA           10          0         0          0
##  9 2020-11-14 VA       201961       3800      1161          1
## 10 2020-11-17 DE        29755        742       203          3
## # ... with 23,705 more rows
## 
## $dfProcess$cdcHosp
## # A tibble: 22,395 x 5
##    date       state   inp hosp_adult hosp_ped
##    <date>     <chr> <dbl>      <dbl>    <dbl>
##  1 2020-07-27 ND       59         NA       NA
##  2 2020-07-26 ND       45         NA       NA
##  3 2020-07-25 ND       43         NA       NA
##  4 2020-07-24 NE      219        202        0
##  5 2020-07-21 ND       46         NA       NA
##  6 2020-07-09 HI       20         NA       NA
##  7 2020-06-24 ME       67         NA       NA
##  8 2020-06-12 MD     1179         NA       NA
##  9 2020-06-10 OR      149         NA       NA
## 10 2020-06-08 NH       94         NA       NA
## # ... with 22,385 more rows
## 
## 
## $dfPerCapita
## # A tibble: 24,285 x 23
##    date       state tot_cases tot_deaths new_cases new_deaths   inp hosp_adult
##    <date>     <chr>     <dbl>      <dbl>     <dbl>      <dbl> <dbl>      <dbl>
##  1 2020-01-01 AL           NA         NA        NA         NA    NA         NA
##  2 2020-01-01 HI           NA         NA        NA         NA     0         NA
##  3 2020-01-01 IN           NA         NA        NA         NA     0         NA
##  4 2020-01-01 LA           NA         NA        NA         NA    NA         NA
##  5 2020-01-01 MN           NA         NA        NA         NA     0         NA
##  6 2020-01-01 MT           NA         NA        NA         NA     0         NA
##  7 2020-01-01 NC           NA         NA        NA         NA     0         NA
##  8 2020-01-01 TX           NA         NA        NA         NA     0         NA
##  9 2020-01-02 AL           NA         NA        NA         NA    NA         NA
## 10 2020-01-02 HI           NA         NA        NA         NA     0         NA
## # ... with 24,275 more rows, and 15 more variables: hosp_ped <dbl>, tcpm <dbl>,
## #   tdpm <dbl>, cpm <dbl>, dpm <dbl>, hpm <dbl>, ahpm <dbl>, phpm <dbl>,
## #   tcpm7 <dbl>, tdpm7 <dbl>, cpm7 <dbl>, dpm7 <dbl>, hpm7 <dbl>, ahpm7 <dbl>,
## #   phpm7 <dbl>
## 
## $useClusters
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS 
##  4  3  3  3  6  4  2  5  5  6  3  9  1  4  1  1  1  6  3  2  5  9  5  4  4  3 
## MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 
##  4  6  1  4  9  2  1  3  2  6  1  9  3  2  3  1  3  6  4  6  9  9  4  6  4 
## 
## $plotData
## NULL
## 
## $consolidatedPlotData
## NULL
## 
## $clCum
## NULL
# Confirm the refactored pipeline reproduces the previous per-capita results exactly
identical(cdc_daily_test_v2$dfPerCapita, cdc_daily_test$dfPerCapita)
## [1] TRUE
# Map the rules-based cluster assignment by state
# (usmap::plot_usmap requires a data frame with a 'state' column, so the named
#  cluster vector is first converted to a state/cluster tibble)
tibble::tibble(state=names(cdc_daily_test_v2$useClusters), 
               cluster=factor(unname(cdc_daily_test_v2$useClusters))
               ) %>%
    usmap::plot_usmap(regions="states", values="cluster", data=.) + 
    labs(title="Rules-based clusters using CDC daily data") + 
    scale_fill_discrete("Cluster")

The updated function appears to work as intended, including producing rules-based clusters. Next steps are to update the cluster assessment algorithms, to account for differences in variable names and interpretations compared with the previous process.

An updated process should include:

  1. Create cluster-level aggregates (e.g., population-weighted mean or unweighted median)
  2. Create overall summary plots such as maps, burden over time, total burden, and the like
  3. Create facetted plots over time that allow for multiple metrics to be displayed simultaneously
  4. Create cumulative burden plots over time

A function is written to create cluster-level aggregates:

# Function to join any number of frames
# Function to join any number of frames
joinFrames <- function(..., fnJoin=dplyr::inner_join, keyJoin=NULL) {
    
    # FUNCTION ARGUMENTS:
    # ...: data frames to be joined using purrr::reduce OR a length-1 list of data frames
    # fnJoin: dplyr function for join (full_join, inner_join, left_join, right_join), applied in order to ...
    # keyJoin: by= variable passed to fnJoin (NULL means anything that matches, the dplyr default)

    # Allow for ... to be already passed as a list
    # Convert ... to a list provisionally, then unwrap element 1 if it is itself a bare list
    # (data frames ARE lists internally but do not carry class "list", so inherits()
    #  correctly leaves a single data frame alone and unwraps only a true list)
    x <- list(...)
    if (length(x) == 1 && inherits(x[[1]], "list")) x <- x[[1]]
    
    # Guard the degenerate call: purrr::reduce() on an empty list has no sensible result here
    if (length(x) == 0) stop("\nAt least one data frame must be supplied\n")
    
    # Join the frames in order and return (by= is forwarded to each fnJoin call)
    purrr::reduce(.x=x, .f=fnJoin, by=keyJoin)
    
}

# Function to convert a named vector to a tibble
# Function to convert a named vector to a tibble
vecToTibble <- function(x, 
                        colNameData="value", 
                        colNameName="name"
                        ) {
    
    # FUNCTION ARGUMENTS:
    # x: a vector (typically would be named, otherwise rownames are 1:length(x))
    # colNameData: the column name in the tibble for the vector data
    # colNameName: the column name in the tibble for the vector names
    
    # as.data.frame() stores the vector's names as rownames (or "1".."n" when unnamed);
    # promote the rownames to a column, convert to a tibble, and apply the requested
    # column names (name column first, then the data column)
    dfBase <- as.data.frame(x)
    dfNamed <- rownames_to_column(dfBase, var=colNameName)
    purrr::set_names(tibble::as_tibble(dfNamed), c(colNameName, colNameData))
    
}

# Function to convert a regular function to one that returns NA for all NA and na.rm=TRUE otherwise
specNA <- function(fn=sum) function(x, ...) if (any(!is.na(x))) fn(x, ..., na.rm=TRUE) else NA

# Function to create a weighted sum (similar to weighted.mean, but not normalizing or dividing by weights)
# Function to create a weighted sum (similar to weighted.mean, but not normalizing or dividing by weights)
specSumProd <- function(x, w=1, na.rm=FALSE) {

    # FUNCTION ARGUMENTS:
    # x: a numeric vector x
    # w: a weighting vector w, which should be either of length(x) or of length 1
    # na.rm: boolean, should missing values in x or w (and their paired partner) be excluded?
    
    # Check that w has an appropriate length
    if (!(length(w) %in% c(1, length(x)))) stop("\nParameter w must be either length 1 or same length as x\n")
    
    # Recycle a scalar weight up front so NA handling can treat x and w pairwise
    if (length(w) == 1) w <- rep(w, length(x))
    
    # If there are any missing values in x OR w, manage both vectors appropriately
    # (previously an NA in w was not removed under na.rm=TRUE, yielding an NA sum)
    keep <- !(is.na(x) | is.na(w))
    if (!all(keep)) {
        if (!isTRUE(na.rm)) return(NA)  # missing values present and na.rm is not TRUE: result is NA
        x <- x[keep]  # drop each NA value together with its paired weight
        w <- w[keep]
    }
    
    # Return the sum of the product of x and w (note that if x is of length 0, this will return 0)
    sum(x*w)
    
}

# Function to create a group aggregate
# Function to create a group aggregate
createGroupAgg <- function(df, 
                           aggTo=c(),
                           aggVars=NULL,
                           aggFunc=specNA(mean),
                           wtVar=NULL, 
                           prefix=NULL
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: a data frame or tibble containing data suitable for aggregation
    # aggTo: level data will be aggregated to in the final result (c() means overall level)
    #        passed as character vector, such as c("cluster", "date") for aggregating to cluster-date
    # aggVars: variables to be aggregated (NULL means aggregate all that are of type is.numeric)
    # aggFunc: a function that can be used for aggregation to the aggTo level
    # wtVar: the weighting variable for the aggregation function (NULL means equal weighting by record)
    #        passed as character, so "pop" means that values should be weighted by pop
    # prefix: a character prefix to be prepended to column names in summarize(across(...))
    #         NULL means use default naming convention

    # If aggVars has been passed as NULL, apply to all variables of type is.numeric
    # (vapply with a declared result type is safer than sapply for this check)
    # Ensure aggVars does not include any grouping variables
    if (is.null(aggVars)) aggVars <- names(df)[vapply(df, is.numeric, logical(1))]
    aggVars <- setdiff(aggVars, aggTo)
    
    # Group by aggTo, then apply aggFunc using do.call, passing w= (the current group's
    # weight column) when wtVar is not NULL
    # Use the .data pronoun rather than get(wtVar): get() resolves through the calling
    # environment and can be shadowed by an unrelated object with the same name
    dfGroup <- df %>%
        group_by(across(.cols=all_of(aggTo))) %>%
        summarize(across(.cols=all_of(aggVars), 
                         .fns=~do.call(aggFunc, 
                                       append(list(x=.x), if(!is.null(wtVar)) list(w=.data[[wtVar]]) else NULL)
                                       ),
                         .names=if(is.null(prefix)) NULL else paste0(prefix, "{.col}")
                         ), 
                  .groups="drop"
                  )
    
    # Return dfGroup
    dfGroup
    
}

The functions are tested using existing data, with aggregated medians created by cluster-date:

# Testing the function for median by cluster-date
# Attach cluster assignments and population to the per-capita data, then take the
# unweighted median of each rolling-7 metric within each cluster-date cell
dfAgg <- vecToTibble(cdc_daily_test_v2$useClusters, colNameData="cluster", colNameName="state") %>%
    joinFrames(cdc_daily_test_v2$stateData, cdc_daily_test_v2$dfPerCapita, keyJoin="state") %>%
    createGroupAgg(aggTo=c("cluster", "date"), 
                   aggFunc=specNA(median), 
                   aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7")
                   )

# Check a plot of dfAgg
# Pivot to long form so each metric can be mapped to a facet row
dfAgg %>%
    pivot_longer(-c("cluster", "date")) %>%
    mutate(name=factor(name, 
                       levels=c("cpm7", "dpm7", "hpm7", "tcpm7", "tdpm7"), 
                       labels=c("New cases", "New deaths", "Hospitalized", "Total cases", "Total deaths")
                       )
           ) %>%
    ggplot(aes(x=date, y=value)) +
    geom_line(aes(group=factor(cluster), color=factor(cluster))) +
    facet_grid(name ~ cluster, scales="free_y") + 
    labs(title="Median rolling-7 mean by date for cluster and metric", 
         subtitle="Includes only non-NA rolling-7 mean states on date", 
         x="", 
         y="Median burden per million"
         ) + 
    scale_color_discrete("Cluster") + 
    scale_x_date(date_breaks="3 months", date_labels="%b %Y") + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=8))
## Warning: Removed 203 row(s) containing missing values (geom_path).

Aggregated medians appear reasonable. Next steps are to enable weighting by population, particularly for averaging or summing.

The function can now use weighted.mean. Example code includes:

# Testing the function for population-weighted mean by cluster-date
dfAgg_wm <- vecToTibble(cdc_daily_test_v2$useClusters, colNameData="cluster", colNameName="state") %>%
    joinFrames(cdc_daily_test_v2$stateData, cdc_daily_test_v2$dfPerCapita, keyJoin="state") %>%
    createGroupAgg(aggTo=c("cluster", "date"), 
                   aggFunc=specNA(weighted.mean), 
                   aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"), 
                   wtVar="pop"
                   )

# Testing the function for unweighted mean by cluster-date
dfAgg_mu <- vecToTibble(cdc_daily_test_v2$useClusters, colNameData="cluster", colNameName="state") %>%
    joinFrames(cdc_daily_test_v2$stateData, cdc_daily_test_v2$dfPerCapita, keyJoin="state") %>%
    createGroupAgg(aggTo=c("cluster", "date"), 
                   aggFunc=specNA(mean), # run the unweighted mean
                   aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7")
                   )

# Check a plot comparing the three aggregates
# bind_rows .id="aggType" records the source frame as "1"/"2"/"3" in binding order
# (weighted mean, unweighted mean, median), which the factor() below relies on
dfAgg_wm %>%
    bind_rows(dfAgg_mu, dfAgg, .id="aggType") %>%
    pivot_longer(-c("cluster", "date", "aggType")) %>%
    mutate(name=factor(name, 
                       levels=c("cpm7", "dpm7", "hpm7", "tcpm7", "tdpm7"), 
                       labels=c("New cases", "New deaths", "Hospitalized", "Total cases", "Total deaths")
                       ), 
           aggType=factor(aggType, levels=c("1", "2", "3"), labels=c("weighted", "unweighted", "median"))
           ) %>%
    ggplot(aes(x=date, y=value)) +
    geom_line(aes(group=aggType, color=aggType)) +
    facet_grid(name ~ cluster, scales="free_y") + 
    labs(title="Rolling-7 mean by date for cluster and metric", 
         subtitle="Weighted mean, unweighted mean, unweighted median", 
         x="", 
         y="Mean 7-day burden per million"
         ) + 
    scale_color_discrete("Cluster") + 
    scale_x_date(date_breaks="3 months", date_labels="%b %Y") + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=8))
## Warning: Removed 105 row(s) containing missing values (geom_path).

There is reasonable alignment among the curves, regardless of choice of mean/median and weighted/unweighted. This is an area to potentially return to, so that the weighting process is not forced to use weighted.mean and could allow for other functions such as sumproduct.

The function createGroupAgg is updated to use do.call(), allowing for w=get(wtVar) to be optionally passed to the aggregation function. The results of the updated function are now included above.

Further, a function for creating sum-products has been created, so that the total reported amount of burden by cluster can be tracked:

# Testing the function for population sum-products by cluster-date
# popMill scales per-million rates back to absolute counts; popTot=1 so its
# sum-product with popMill yields cluster population in millions
dfAgg_sumprod <- vecToTibble(cdc_daily_test_v2$useClusters, colNameData="cluster", colNameName="state") %>%
    joinFrames(cdc_daily_test_v2$stateData, cdc_daily_test_v2$dfPerCapita, keyJoin="state") %>%
    mutate(popMill=pop/1000000, popTot=1) %>%
    createGroupAgg(aggTo=c("cluster", "date"), 
                   aggFunc=specNA(specSumProd), 
                   aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7", "popTot"), 
                   wtVar="popMill"
                   )

# Check a plot of dfAgg_sumprod
dfAgg_sumprod %>%
    pivot_longer(-c("cluster", "date")) %>%
    mutate(name=factor(name, 
                       levels=c("cpm7", "dpm7", "hpm7", "tcpm7", "tdpm7", "popTot"), 
                       labels=c("New cases", "New deaths", "Hospitalized", 
                                "Total cases", "Total deaths", "Pop (mln)"
                                )
                       )
           ) %>%
    ggplot(aes(x=date, y=value)) +
    geom_line(aes(color=factor(cluster))) +
    facet_grid(name ~ cluster, scales="free_y") + 
    labs(title="Rolling-7 total by date for cluster and metric", 
         x="", 
         y="Total 7-day burden"
         ) + 
    scale_color_discrete("Cluster") + 
    scale_x_date(date_breaks="3 months", date_labels="%b %Y") + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=8))
## Warning: Removed 126 row(s) containing missing values (geom_path).

The aggregation functions appear to be working as intended. Next steps are to create overall summary-level plots for the data (maps, new burden, total burden, etc.).

Functions are created to take an aggregation file and to plot 1) a map, 2) the trend of any aggregate over time, or 3) the most-recent value of any aggregate at a point in time:

# Helper function to make a summary map
# Helper function to make a summary map
helperSummaryMap <- function(df, 
                             mapLevel="states", 
                             keyCol="state",
                             values="cluster",
                             discreteValues=NULL,
                             legend.position="right",
                             labelScale=TRUE,
                             extraArgs=list(),
                             ...
                             ) {
    
    # FUNCTION ARGUMENTS:
    # df: a data frame containing a level of geography and an associated cluster
    # mapLevel: a parameter for whether the map is "states" or "counties"
    # keyCol: the key column for plotting (usmap::plot_usmap is particular, and this must be 'state' or 'fips')
    # values: the character name of the field containing the data to be plotted
    # discreteValues: boolean for whether the values are discrete (if not, use continuous)
    #                 NULL means infer from data
    # legend.position: character for the location of the legend in the plot
    # labelScale: boolean, should a scale_fill_ be created?  Use FALSE if contained in extraArgs
    # extraArgs: list of other arguments that will be appended as '+' to the end of the usmap::plot_usmap call
    # ...: other parameters to be passed to usmap::plot_usmap (e.g., labels, include, exclude, etc.)

    # Modify the data frame to contain only the relevant data
    df <- df %>%
        select(all_of(c(keyCol, values))) %>%
        distinct()
    
    # Determine the type of data being plotted (non-numeric implies discrete)
    if (is.null(discreteValues)) discreteValues <- !is.numeric(df[[values]])
    
    # Convert data type if needed
    # Use scalar && (not vectorized &) since both conditions are length-1 logicals
    if (isTRUE(discreteValues) && is.numeric(df[[values]])) 
        df[[values]] <- factor(df[[values]])
    
    # Create the map, position the legend, and (optionally) label the fill scale
    gg <- usmap::plot_usmap(regions=mapLevel, data=df, values=values, ...) + 
        theme(legend.position=legend.position)
    if (isTRUE(labelScale)) {
        gg <- gg + 
            if (isTRUE(discreteValues)) scale_fill_discrete(values) else scale_fill_continuous(values)
    }
    
    # Append any extra ggplot components supplied by the caller
    for (ctr in seq_along(extraArgs)) gg <- gg + extraArgs[[ctr]]
    
    # Return the map object
    gg
    
}


# Function to plot an aggregate over time
# Function to plot an aggregate over time
helperAggTrend <- function(df, 
                           aggVars,
                           xVar="date",
                           grpVar="cluster", 
                           xLab="", 
                           yLab="Rolling 7-day mean per million", 
                           title=paste0(yLab, " by cluster"), 
                           subtitle=NULL, 
                           linesize=0,
                           mapper=varMapper,
                           extraArgs=list()
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: a data frame containing the relevant aggregate
    # aggVars: the aggregate variables to be plotted
    # xVar: the x-variable to be plotted (must be in df)
    # grpVar: the grouping variable in df (a single column name for plotting;
    #         the factor-conversion loop below tolerates a vector, but the aesthetic
    #         mappings assume length 1 — TODO confirm intended usage)
    #         note that df should be unique by xVar-grpVar
    # xLab: the xlabel for the plot
    # yLab: the ylabel for the plot
    # title: the title for the plot
    # subtitle: the subtitle for the plot (NULL means no subtitle)
    # linesize: argument passed to geom_line(size=); zero will be the default thinnest line
    # mapper: mapping file for variable to descriptive name
    # extraArgs: list of other arguments that will be appended as '+' to the end of the ggplot call
    
    # Ensure that grpVar is not numeric (convert to factor if needed)
    for (vrbl in grpVar) if(is.numeric(df[[vrbl]])) df[[vrbl]] <- factor(df[[vrbl]])
    
    # Create the aggregate plot
    # Use the .data pronoun for string-named columns instead of the deprecated aes_string();
    # labs(color=grpVar) preserves the legend title that aes_string() produced
    gg <- df %>%
        select(all_of(c(xVar, grpVar, aggVars))) %>%
        pivot_longer(-all_of(c(xVar, grpVar))) %>%
        filter(!is.na(value)) %>%
        ggplot(aes(x=.data[[xVar]], y=value)) +
        geom_line(aes(group=.data[[grpVar]], color=.data[[grpVar]]), size=linesize) +
        facet_wrap(~mapper[name], scales="free_y") +
        labs(x=xLab, y=yLab, title=title, color=grpVar)
    if (!is.null(subtitle)) gg <- gg + labs(subtitle=subtitle)
    for (ctr in seq_along(extraArgs)) gg <- gg + extraArgs[[ctr]]
    
    # Return the plot
    gg
    
}


# Function to plot an aggregate (non-temporal)
helperAggTotal <- function(df, 
                           aggVars,
                           grpVar="cluster", 
                           xLab="Cluster", 
                           yLab="Totals", 
                           title=paste0(yLab, " by cluster"), 
                           subtitle=NULL, 
                           divideBy=c(),
                           mapper=varMapper,
                           extraArgs=list()
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: a data frame containing the relevant aggregate
    # aggVars: the aggregate variables to be plotted (one facet per variable)
    # grpVar: the intended facetting variable in the plot
    # xLab: the xlabel for the plot
    # yLab: the ylabel for the plot
    # title: the title for the plot
    # subtitle: the subtitle for the plot (NULL means no subtitle)
    # divideBy: a named vector of c("variable"=divisor) where variable will be divided by divisor
    #           variables not in the vector get no divisor (treated as 1)
    # mapper: mapping file for variable to descriptive name
    # extraArgs: list of other arguments that will be appended as '+' to the end of the ggplot call
    
    # Ensure that grpVar is not numeric (convert to factor if needed)
    for (vrbl in grpVar) if(is.numeric(df[[vrbl]])) df[[vrbl]] <- factor(df[[vrbl]])
    
    # Create the aggregate plot
    # Keeps only the last non-NA row per group-variable, i.e. the latest value
    # (the running total for cumulative metrics); assumes df rows are in
    # chronological order within each group -- TODO confirm with callers
    # FIX: all_of() was previously misused outside a tidyselect context
    #      (in aes_string(fill=) and group_by_at()); that usage is deprecated and
    #      errors in tidyselect >= 1.2. Also replaced fragile get(grpVar) with
    #      the .data pronoun inside aes().
    gg <- df %>%
        select(all_of(c(grpVar, aggVars))) %>%
        pivot_longer(-all_of(c(grpVar))) %>%
        filter(!is.na(value)) %>%
        group_by(across(all_of(c(grpVar, "name")))) %>%
        filter(row_number()==n()) %>%
        ungroup() %>%
        mutate(useValue=ifelse(name %in% names(divideBy), value/divideBy[name], value)) %>%
        ggplot(aes(x=fct_rev(.data[[grpVar]]), y=useValue)) + 
        geom_col(aes(fill=.data[[grpVar]])) + 
        geom_text(aes(y=useValue/2, label=round(useValue))) +
        coord_flip() +
        facet_wrap(~mapper[name], scales="free_x") +
        labs(x=xLab, y=yLab, title=title)
    if (!is.null(subtitle)) gg <- gg + labs(subtitle=subtitle)
    for (ctr in seq_along(extraArgs)) gg <- gg + extraArgs[[ctr]]
    
    # Return the plot
    gg
    
}

The functions are then tested:

# Build an integrated state-level frame: cluster assignments joined to state
# metadata and per-capita metrics; popMill is population in millions and
# popTot=1 is a constant per-row indicator
dfFull <- vecToTibble(cdc_daily_test_v2$useClusters, colNameData="cluster", colNameName="state") %>%
    joinFrames(cdc_daily_test_v2$stateData, cdc_daily_test_v2$dfPerCapita, keyJoin="state") %>%
    mutate(popMill=pop/1000000, popTot=1)

# Create map by cluster
helperSummaryMap(dfFull, discreteValues=TRUE)

# Create trend of daily disease burden (cases/hospitalizations/deaths per million) over time
# NOTE(review): dfAgg_wm is not defined in this chunk - assumed built earlier; confirm
helperAggTrend(dfAgg_wm, 
               aggVars=c("cpm7", "hpm7", "dpm7")
               )

# Create trend of cumulative disease burden over time, with a rotated bimonthly date axis
helperAggTrend(dfAgg_wm, 
               aggVars=c("tcpm7", "tdpm7"), 
               yLab="Cumulative burden per million", 
               extraArgs=list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"), 
                              arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                              )
               )

# Create population, cases, and deaths by cluster (cases/deaths rescaled to thousands)
# NOTE(review): dfAgg_sumprod is not defined in this chunk - assumed built earlier; confirm
helperAggTotal(dfAgg_sumprod, 
               aggVars=c("popTot", "tcpm7", "tdpm7"), 
               divideBy=c("tcpm7"=1000, "tdpm7"=1000),
               mapper=c("popTot"="Population (millions)", "tcpm7"="Cases (000)", "tdpm7"="Deaths (000)")
               )

The functions appear to be working as intended. Next steps are to create an overall summary page.

A function is written to create an overall summary page from a processed CDC daily file:

createSummary <- function(lst, 
                          brewPalette=NA
                          ) {
    
    # FUNCTION ARGUMENTS:
    # lst: a processed list file from readRunCDCDaily()
    # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
    
    # Helper returning the fill-palette override components (empty list when using
    # default colors); a fresh scale object is created on each call so every plot
    # receives its own copy
    fillArgs <- function() {
        if (is.na(brewPalette)) list() 
        else list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
    }
    
    # Create an integrated database including cluster data and population data    
    dfFull <- vecToTibble(lst[["useClusters"]], colNameData="cluster", colNameName="state") %>%
        joinFrames(lst[["stateData"]], lst[["dfPerCapita"]], keyJoin="state")

    # Create a four-quadrant summary of the segments, including:
    # 1. Map of segments
    # 2. Population of segments
    # 3. Burden per million of segments
    # 4. Trend-line of burden by segments
    
    # 1. Map of the clusters
    p1 <- helperSummaryMap(dfFull, 
                           discreteValues=TRUE, 
                           labelScale=is.na(brewPalette), 
                           extraArgs=fillArgs()
                           )
    
    # 2. Population by cluster, displayed in millions
    dfAgg_pop <- createGroupAgg(dfFull, 
                                aggTo=c("cluster", "date"), 
                                aggFunc=specNA(specSumProd), 
                                aggVars=c("pop")
                                )
    p2 <- helperAggTotal(dfAgg_pop, 
                         aggVars=c("pop"), 
                         mapper=c("pop"="Population (millions)"), 
                         xLab="", 
                         yLab="", 
                         title="",
                         divideBy=c("pop"=1000000), 
                         extraArgs=fillArgs()
                         )
    
    # Population-weighted mean data reused by both the total-burden and trend plots
    dfAgg_wm <- createGroupAgg(dfFull, 
                               aggTo=c("cluster", "date"), 
                               aggFunc=specNA(weighted.mean), 
                               aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"), 
                               wtVar="pop"
                               )
    
    # 3. Total case and death burden per million by cluster
    p3 <- helperAggTotal(dfAgg_wm, 
                         aggVars=c("tcpm7", "tdpm7"), 
                         mapper=c("tcpm7"="Cases per million", "tdpm7"="Deaths per million"), 
                         xLab="", 
                         yLab="", 
                         title="",
                         extraArgs=fillArgs()
                         )
    
    # 4. Cumulative burden per million over time (lines colored by cluster,
    #    bimonthly date breaks with rotated axis labels)
    p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"), 
                   arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                   )
    if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
    p4 <- helperAggTrend(dfAgg_wm, 
                         aggVars=c("tcpm7", "tdpm7"), 
                         yLab="",
                         title="Cumulative burden per million by cluster", 
                         linesize=0.75,
                         extraArgs=p4xtra
                         )

    # Arrange the four panels (map, population, trend, totals) on one page
    gridExtra::grid.arrange(p1, p2, p4, p3)
    
}

The function is run for the CDC daily data, with the “Paired” palette:

createSummary(cdc_daily_test_v2, brewPalette="Paired")

The dashboard provides a decent summary of the segments at a glance. Next steps are to add a few additional detailed plots, such as trends over time by metric and segment.

The function is updated to allow for increased flexibility with parameters. Specifically, group aggregates are created outside the function, allowing the function to be focused on plotting:

# Function for creating an integrated data frame
integrateData <- function(lst=NULL, 
                          lstExtract=list("useClusters"=function(x) vecToTibble(x, 
                                                                                colNameData="cluster",
                                                                                colNameName="state"
                                                                                ), 
                                          "stateData"=function(x) colSelector(x, vecSelect=c("state", "pop")), 
                                          "dfPerCapita"=NULL
                                          ),
                          keyJoin="state",
                          fnJoin=dplyr::inner_join,
                          otherDF=list()
                          ) {
    
    # FUNCTION ARGUMENTS:
    # lst: a list containing one or more data frame elements (NULL means only otherDF is joined)
    # lstExtract: named list of elements to pull from lst, each paired with an
    #             optional converter function (NULL means use the element as-is)
    # keyJoin: the merge key for the data frames
    # fnJoin: the merge function, reduced across the extracted frames followed by otherDF
    # otherDF: list of additional data frames to be joined after the extracted ones
    
    # Pull the requested elements out of lst, applying each converter when supplied
    frames <- list()
    if (!is.null(lst)) {
        for (nm in names(lstExtract)) {
            converter <- lstExtract[[nm]]
            frames[[nm]] <- if (is.null(converter)) lst[[nm]] else converter(lst[[nm]])
        }
    }
    
    # Join the extracted frames plus any extras into one consolidated data frame
    joinFrames(append(frames, otherDF), fnJoin=fnJoin, keyJoin=keyJoin)
    
}

# Function for creating an aggregate-level frame
combineAggData <- function(df, 
                           aggTo=c("cluster", "date"), 
                           aggBy=list("agg1"=list(aggFunc=specNA(specSumProd), 
                                                  aggVars=c("pop"), 
                                                  wtVar=NULL, 
                                                  prefix=NULL
                                                  ), 
                                      "agg2"=list(aggFunc=specNA(weighted.mean), 
                                                  aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"), 
                                                  wtVar="pop", 
                                                  prefix="wm_"
                                                  )
                                      ),
                           fnJoin=dplyr::full_join
                           ) {

    # FUNCTION ARGUMENTS:
    # df: a data frame containing data for summarizing to an aggregate
    # aggTo: the level to which data should be aggregated
    # aggBy: a list of lists directing the creation of aggregates
    #        (aggFunc/aggVars/wtVar/prefix are forwarded to createGroupAgg)
    # fnJoin: a function for joining the resulting aggTo-level aggregates

    # Build one aggregate frame per instruction set
    # Potentially update later so that the instructions can be skipped when the defaults should be used
    aggList <- lapply(unname(aggBy), function(spec) {
        createGroupAgg(df, 
                       aggTo=aggTo, 
                       aggFunc=spec$aggFunc, 
                       aggVars=spec$aggVars,
                       wtVar=spec$wtVar, 
                       prefix=spec$prefix
                       )
    })
    
    # Return the joined data
    joinFrames(aggList, fnJoin=fnJoin, keyJoin=aggTo)
    
}

createSummary <- function(df, 
                          stateClusterDF=NULL,
                          brewPalette=NA
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: an integrated data frame by cluster-date
    # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
    # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)

    # Helper returning the fill-palette override components (empty list when using
    # default colors); a fresh scale object is created on each call so every plot
    # receives its own copy
    fillArgs <- function() {
        if (is.na(brewPalette)) list() 
        else list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
    }

    # Create a four-quadrant summary of the segments, including:
    # 1. Map of segments
    # 2. Population of segments
    # 3. Burden per million of segments
    # 4. Trend-line of burden by segments
    
    # 1. Map of the clusters (fall back to df when no state-cluster frame is given)
    p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF, 
                           discreteValues=TRUE, 
                           labelScale=is.na(brewPalette), 
                           extraArgs=fillArgs()
                           )
    
    # 2. Population by cluster, displayed in millions
    p2 <- helperAggTotal(df, 
                         aggVars=c("pop"), 
                         mapper=c("pop"="Population (millions)"), 
                         xLab="", 
                         yLab="", 
                         title="",
                         divideBy=c("pop"=1000000), 
                         extraArgs=fillArgs()
                         )
    
    # 3. Total case and death burden per million by cluster
    p3 <- helperAggTotal(df, 
                         aggVars=c("wm_tcpm7", "wm_tdpm7"), 
                         mapper=c("wm_tcpm7"="Cases per million", "wm_tdpm7"="Deaths per million"), 
                         xLab="", 
                         yLab="", 
                         title="",
                         extraArgs=fillArgs()
                         )
    
    # 4. Cumulative burden per million over time (lines colored by cluster,
    #    bimonthly date breaks with rotated axis labels)
    p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"), 
                   arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                   )
    if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
    p4 <- helperAggTrend(df, 
                         aggVars=c("wm_tcpm7", "wm_tdpm7"), 
                         mapper=c("wm_tcpm7"="Cases per million", "wm_tdpm7"="Deaths per million"),
                         yLab="",
                         title="Cumulative burden per million by cluster", 
                         linesize=0.75,
                         extraArgs=p4xtra
                         )

    # Arrange the four panels (map, population, trend, totals) on one page
    gridExtra::grid.arrange(p1, p2, p4, p3)
    
}

The functions are tested:

# Use defaults and confirm that resulting database is the same
# (the default lstExtract should reproduce the manual extraction/conversion below)
dfFull <- integrateData(cdc_daily_test_v2)
identical(dfFull, 
          integrateData(otherDF=list(cdc_daily_test_v2$useClusters %>% 
                                         vecToTibble(colNameData="cluster", colNameName="state"), 
                                     cdc_daily_test_v2$stateData %>% colSelector(vecSelect=c("state", "pop")), 
                                     cdc_daily_test_v2$dfPerCapita
                                     )
                        )
          )
## [1] TRUE
# Test the aggregation capability
# (defaults: pop aggregated via specNA(specSumProd); per-capita metrics via
# population-weighted mean with prefix "wm_", yielding a cluster-date frame)
dfFull <- integrateData(cdc_daily_test_v2) %>%
    combineAggData()
dfFull
## # A tibble: 3,416 x 8
##    cluster date           pop wm_tcpm7 wm_tdpm7 wm_cpm7 wm_dpm7 wm_hpm7
##      <dbl> <date>       <dbl>    <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
##  1       1 2020-01-01 6732219       NA       NA      NA      NA      NA
##  2       1 2020-01-02 6732219       NA       NA      NA      NA      NA
##  3       1 2020-01-03 6732219       NA       NA      NA      NA      NA
##  4       1 2020-01-04 6732219       NA       NA      NA      NA       0
##  5       1 2020-01-05 6732219       NA       NA      NA      NA       0
##  6       1 2020-01-06 6732219       NA       NA      NA      NA       0
##  7       1 2020-01-07 6732219       NA       NA      NA      NA       0
##  8       1 2020-01-08 6732219       NA       NA      NA      NA       0
##  9       1 2020-01-09 6732219       NA       NA      NA      NA       0
## 10       1 2020-01-10 6732219       NA       NA      NA      NA       0
## # ... with 3,406 more rows
# Test the dashboard capability
# (state-cluster assignments are supplied separately because dfFull is now at
# the cluster-date level and no longer carries a state column)
createSummary(dfFull, 
              stateClusterDF=vecToTibble(cdc_daily_test_v2[["useClusters"]], 
                                         colNameData="cluster", 
                                         colNameName="state"
                                         ),
              brewPalette="Paired"
              )

The functions are now split, allowing for the use of dfFull in producing other summary statistics about the clusters.